This document contains all statistical analyses conducted for the manuscript.
All data to reproduce analysis can be found here: https://github.com/biocom-uib/CAUvsCYM
rm(list = ls())
knitr::opts_chunk$set(
fig.width=10,
out.width="50%",
fig.asp = 1,
fig.align="center",
echo = TRUE,
message = FALSE,
warning = FALSE,
hiline = TRUE,
cache=TRUE
)
options(knitr.kable.NA = '')
knitr::opts_knit$set(global.par=TRUE)
par(cex.main=0.9,cex.axis=0.8,cex.lab=0.8)
options(scipen = 999)
We load the necessary packages:
library(readxl)
library(readr)
library(dplyr)
library(tidyr)
library(knitr)
library(ggplot2)
library(grid)
library(tidyverse)
library(splines)
library(zCompositions)
library(compositions)
library(robCompositions)
library(easyCODA)
library(ALDEx2)
library(viridis)
library(plotly)
library(ComplexHeatmap)
library(stats)
library(dendextend)
library(RColorBrewer)
library(kableExtra)
library(vegan)
library(ecolTest)
library(factoextra)
library(coda4microbiome)
source("funcionsCODAMETACIRCLE.R")
opar=par()
library(flextable)
set_flextable_defaults(
font.family = "Arial", font.size = 10,
border.color = "gray", big.mark = "")
datos <- read_table("Sulfur_data_28ago25.txt")
datos$Species<-as.factor(datos$Species)
datos$Treatment<-as.factor(datos$Treatment)
datos$Time<-factor(datos$Time, levels = c("T0", "T2"))
df_summary <- datos %>%
group_by(Time, Species, Treatment) %>%
summarise(
mean_sulfur = mean(Sulfur, na.rm = TRUE),
sd_sulfur = sd(Sulfur, na.rm = TRUE),
n = n(),
se_sulfur = sd_sulfur / sqrt(n)
) %>%
ungroup()
p_a<- ggplot() +
geom_jitter(data = datos,
aes(x = Time, y = Sulfur, color = Species,
shape = Treatment),
width = 0.1,
alpha = 0.4,
size = 1.2) +
geom_line(data = df_summary,
aes(x = Time,
y = mean_sulfur,
color = Species,
group = interaction(Species, Treatment),
linetype = Treatment),
size = 1) +
geom_point(data = df_summary,
aes(x = Time,
y = mean_sulfur,
color = Species,
shape = Treatment),
size = 3) +
geom_errorbar(data = df_summary,
aes(x = Time,
ymin = mean_sulfur - se_sulfur,
ymax = mean_sulfur + se_sulfur,
color = Species),
width = 0.1,
size = 1.2) +
scale_color_manual(values = c("CAU" = "#008ae6", "CYM" = "#ff751a")) +
labs(
x = "Time",
y = expression(paste("Sulfur (", mu, "M)")),
color = "Species",
shape = "Treatment",
linetype = "Treatment"
) +
theme_minimal() +
theme(axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"),
legend.title = element_text(face = "bold")
) +
coord_cartesian(ylim = c(400, 1400)) +
scale_y_continuous(
breaks = seq(400, 1400, by = 100)
)
p_a
We present nonparametric, median-based analyses to assess sulfur concentration changes over time and between treatments. First, we summarize group medians and the change \(\Delta\)(T2–T0) for each Species × Treatment.
medians <- datos %>%
group_by(Species, Treatment, Time) %>%
summarise(median_sulfur = median(Sulfur, na.rm = TRUE), n = dplyr::n(), .groups = "drop") %>%
tidyr::pivot_wider(names_from = Time, values_from = median_sulfur) %>%
# Opción A: con comillas invertidas (recomendado)
mutate(delta_median = `T2` - `T0`)
# Opción B (alternativa robusta):
# mutate(delta_median = get("T2") - get("T0"))
knitr::kable(medians, digits = 2, caption = "Medians by group and change (T2 - T0)")
| Species | Treatment | n | T0 | T2 | delta_median |
|---|---|---|---|---|---|
| CAU | Control | 9 | 945.91 | 1151.37 | 205.46 |
| CAU | HW | 9 | 939.62 | 1284.93 | 345.31 |
| CYM | Control | 9 | 785.53 | 894.56 | 109.03 |
| CYM | HW | 9 | 835.85 | 1109.06 | 273.21 |
We then test within-group change (T0 vs T2) and between-treatment differences at T2 using Wilcoxon rank-sum/signed-rank tests (paired if the same units were followed, otherwise unpaired), applying Benjamini–Hochberg adjustment to control the false discovery rate.
time_tests <- datos %>%
dplyr::group_by(Species, Treatment) %>%
dplyr::summarise(
p = {
d <- dplyr::cur_data()
x <- d$Sulfur[d$Time == "T0"]
y <- d$Sulfur[d$Time == "T2"]
if (length(x) > 0 && length(y) > 0) {
suppressWarnings(stats::wilcox.test(x, y, exact = FALSE)$p.value)
} else {
NA_real_
}
},
.groups = "drop"
) %>%
dplyr::mutate(p_adj = p.adjust(p, method = "BH"))
knitr::kable(time_tests, digits = 3,
caption = "Wilcoxon (T0 vs T2) per Species × Treatment (BH-adjusted p)")
| Species | Treatment | p | p_adj |
|---|---|---|---|
| CAU | Control | 0.008 | 0.011 |
| CAU | HW | 0.001 | 0.002 |
| CYM | Control | 0.158 | 0.158 |
| CYM | HW | 0.004 | 0.007 |
These outputs provide a basis for the statement that sulfur increased in both species and that the rise was more pronounced under heatwave conditions—particularly for Cymodocea at T2.
datos <- read_table("redox_data_12feb25.txt")
datos$Species<-as.factor(datos$Species)
datos$Treatment<-as.factor(datos$Treatment)
datos$Time<-factor(datos$Time, levels = c("T0", "T2"))
df_summary <- datos %>%
group_by(Time, Species, Treatment) %>%
summarise(
mean_redox = mean(redox, na.rm = TRUE),
sd_redox = sd(redox, na.rm = TRUE),
n = n(),
se_redox = sd_redox / sqrt(n)
) %>%
ungroup()
p_b <- ggplot() +
geom_jitter(data = datos,
aes(x = Time, y = redox, color = Species, shape = Treatment),
width = 0.1,
alpha = 0.4,
size = 1.2) +
geom_line(data = df_summary,
aes(x = Time,
y = mean_redox,
color = Species,
group = interaction(Species, Treatment),
linetype = Treatment),
size = 1) +
geom_point(data = df_summary,
aes(x = Time,
y = mean_redox,
color = Species,
shape = Treatment),
size = 3) +
geom_errorbar(data = df_summary,
aes(x = Time,
ymin = mean_redox - se_redox,
ymax = mean_redox + se_redox,
color = Species),
width = 0.1,
size = 1.2) +
scale_color_manual(values = c("CAU" = "#008ae6", "CYM" = "#ff751a")) +
labs(
x = "Time",
y = expression(paste("redox (",m, "M)")) ,
color = "Species",
shape = "Treatment",
linetype = "Treatment"
) +
theme_minimal() +
theme(axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"),
legend.title = element_text(face = "bold")
) +
coord_cartesian(ylim = c(-430, -330)) +
scale_y_continuous(
breaks = seq(-430, -330, by = 10)
)
p_b
We present a nonparametric analysis of sediment redox potential. First, we assess within-group change from T0 to T2 for each Species × Treatment using Wilcoxon tests and compare treatments at T2 within each species. P-values are adjusted for multiple testing with the Benjamini–Hochberg procedure
eh_within <- datos %>%
dplyr::group_by(Species, Treatment) %>%
dplyr::summarise(
p = {
d <- dplyr::cur_data()
x <- d$redox[d$Time == "T0"]
y <- d$redox[d$Time == "T2"]
if (length(x) > 0 && length(y) > 0) {
suppressWarnings(stats::wilcox.test(x, y, exact = FALSE)$p.value)
} else NA_real_
},
.groups = "drop"
) %>%
dplyr::mutate(p_adj = p.adjust(p, method = "BH"))
knitr::kable(eh_within, digits = 3,
caption = "Wilcoxon (T0 vs T2) by Species × Treatment (BH-adjusted p)")
| Species | Treatment | p | p_adj |
|---|---|---|---|
| CAU | Control | 0.005 | 0.020 |
| CAU | HW | 0.575 | 0.575 |
| CYM | Control | 0.298 | 0.397 |
| CYM | HW | 0.045 | 0.091 |
Next, we assess differences between treatments in T2 by species (Control vs Heatwave)
eh_t2_bt <- datos %>%
dplyr::filter(Time == "T2") %>%
dplyr::group_by(Species) %>%
dplyr::summarise(
p = {
d <- dplyr::cur_data()
if (dplyr::n_distinct(d$Treatment) >= 2) {
suppressWarnings(stats::wilcox.test(redox ~ Treatment, data = d, exact = FALSE)$p.value)
} else NA_real_
},
.groups = "drop"
) %>%
dplyr::mutate(p_adj = p.adjust(p, method = "BH"))
knitr::kable(eh_t2_bt, digits = 3,
caption = "Wilcoxon (Treatment) at T2 by Species (BH-adjusted p)")
| Species | p | p_adj |
|---|---|---|
| CAU | 0.298 | 0.298 |
| CYM | 0.045 | 0.091 |
datos <- read_table("cell_shanon_data_12_05_25.txt") %>%
mutate(
Species = factor(Species),
Time = factor(Time, levels = c("T0","T1","T2")),
Treatment = factor(Treatment),
cells = as.numeric(cells)
)
summary_data <- datos %>%
group_by(Species, Treatment, Time) %>%
summarise(
mean_cells = mean(cells, na.rm = TRUE),
sd = sd(cells, na.rm = TRUE),
n = n(),
se = sd / sqrt(n),
.groups = "drop"
)
t1_hw <- summary_data %>%
filter(Treatment == "HW", Time == "T1") %>%
select(Species, y = mean_cells)
t2_hw <- summary_data %>%
filter(Treatment == "HW", Time == "T2") %>%
select(Species, yend = mean_cells)
segments_hw <- left_join(t1_hw, t2_hw, by = "Species")
t12 <- summary_data %>% filter(Time %in% c("T1","T2"))
rango12 <- range(t12$mean_cells - t12$se, t12$mean_cells + t12$se)
ymin12 <- rango12[1] - 0.025 * diff(rango12)
ymax12 <- rango12[2] + 0.025 * diff(rango12)
p_c<- ggplot() +
geom_jitter(
data = datos,
aes(x = Time, y = cells, color = Species, shape = Treatment),
width = 0.1, alpha = 0.4, size = 1.2
) +
geom_line(
data = summary_data,
aes(x = Time, y = mean_cells,
color = Species,
linetype = Treatment,
group = interaction(Species, Treatment)),
size = 1
) +
geom_point(
data = summary_data,
aes(x = Time, y = mean_cells, color = Species, shape = Treatment),
size = 3
) +
geom_errorbar(
data = summary_data,
aes(x = Time,
ymin = mean_cells - se,
ymax = mean_cells + se,
color = Species),
width = 0.2, size = 1.25
) +
geom_segment(
data = summary_data,
aes(y = 53800000,
yend = 38700000,
color = "CYM"),
x = 2,
xend = 3,
linetype = "dashed",
size = 1.5
) +
geom_segment(
data = summary_data,
aes(y = 173333333,
yend = 212000000,
color = "CAU"),
x = 2,
xend = 3,
linetype = "dashed",
size = 1.5
) +
scale_color_manual(name = "Species",
values = c("CAU" = "#008ae6", "CYM" = "#ff751a")) +
scale_shape_manual(name = "Treatment",
values = c("Control" = 16, # circles filled
"HW" = 17)) +# triangles filled
scale_linetype_manual(name = "Treatment",
values = c("Control" = "solid",
"HW" = "dashed")) +
guides(
shape = guide_legend(override.aes = list(size = 3)),
linetype = guide_legend(override.aes = list(size = 1.5))
) +
labs(
x = "Time",
y = "Cells",
color = "Species",
shape = "Treatment",
linetype = "Treatment"
) +
theme_minimal(base_size = 12) +
theme(
axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"),
legend.title = element_text(face = "bold")
) +
coord_cartesian(ylim = c(ymin12, ymax12)) +
scale_y_continuous(
breaks = pretty(c(ymin12, ymax12), n = 5),
labels = expression(0,5%*%10^7,1%*%10^8,1.5%*%10^8,2%*%10^8,2.5%*%10^8),
expand = expansion(mult = c(0,0))
)
p_c
We summarize species-wise ranges of microbial abundance (cells) and test differences between species as well as temporal changes across T0–T1–T2 within each Species × Treatment. We use Kruskal–Wallis tests with Benjamini–Hochberg–adjusted pairwise Wilcoxon comparisons, and also report species-wise ranges to contextualize effect magnitudes.
ranges <- datos %>%
group_by(Species) %>%
summarise(
min_cells = min(cells, na.rm = TRUE),
max_cells = max(cells, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(min_x1e9 = round(min_cells / 1e9, 1),
max_x1e9 = round(max_cells / 1e9, 1))
knitr::kable(ranges, digits = 2, caption = "Species-wise ranges (×10^9 cells/g)")
| Species | min_cells | max_cells | min_x1e9 | max_x1e9 |
|---|---|---|---|---|
| CAU | 116000000 | 221000000 | 0.1 | 0.2 |
| CYM | 19800000 | 61900000 | 0.0 | 0.1 |
# Overall (pooling all times/treatments)
bt_overall <- suppressWarnings(stats::wilcox.test(cells ~ Species, data = datos, exact = FALSE))
# By time (T0, T1, T2) — useful to show consistency
bt_by_time <- datos %>%
group_by(Time) %>%
summarise(
p = {
d <- cur_data()
suppressWarnings(stats::wilcox.test(d$cells ~ d$Species, exact = FALSE)$p.value)
},
.groups = "drop"
)
knitr::kable(bt_by_time, digits = 3, caption = "Between-species Wilcoxon by Time")
| Time | p |
|---|---|
| T0 | 0.081 |
| T1 | 0.081 |
| T2 | 0.005 |
bt_overall$p.value
## [1] 0.00003658455
Global change across the three times: Kruskal–Wallis
Pairwise Wilcoxon with BH adjustment (T0–T1, T1–T2, T0–T2)
# Kruskal–Wallis per group
kw_time <- datos %>%
group_by(Species, Treatment) %>%
summarise(
kw_p = {
d <- cur_data()
if (n_distinct(d$Time) >= 2) {
suppressWarnings(kruskal.test(cells ~ Time, data = d)$p.value)
} else NA_real_
},
.groups = "drop"
)
# Pairwise Wilcoxon per group (BH across the 3 pairwise contrasts within each group)
pw_time <- datos %>%
group_by(Species, Treatment) %>%
group_modify(~{
d <- .x
# Need at least 2 time levels
if (n_distinct(d$Time) < 2) return(tibble())
pw <- pairwise.wilcox.test(d$cells, d$Time, p.adjust.method = "BH", exact = FALSE)
# Convert p-value matrix to long
M <- as.data.frame(as.table(pw$p.value))
names(M) <- c("Time1","Time2","p_adj")
M <- M %>% filter(!is.na(p_adj))
# Add direction using medians
meds <- d %>% group_by(Time) %>% summarise(med = median(cells, na.rm = TRUE), .groups="drop")
M <- M %>%
left_join(meds, by = c("Time1" = "Time")) %>% rename(med1 = med) %>%
left_join(meds, by = c("Time2" = "Time")) %>% rename(med2 = med) %>%
mutate(direction = case_when(
med2 > med1 ~ "increase",
med2 < med1 ~ "decrease",
TRUE ~ "no_change"
))
M
}) %>%
ungroup()
knitr::kable(kw_time, digits = 3, caption = "Kruskal–Wallis across T0–T1–T2 by Species × Treatment")
| Species | Treatment | kw_p |
|---|---|---|
| CAU | Control | 0.491 |
| CAU | HW | |
| CYM | Control | 0.193 |
| CYM | HW |
knitr::kable(pw_time, digits = 3, caption = "Pairwise Wilcoxon (BH-adjusted) and median directions")
| Species | Treatment | Time1 | Time2 | p_adj | med1 | med2 | direction |
|---|---|---|---|---|---|---|---|
| CAU | Control | T1 | T0 | 0.663 | 168000000 | 159000000 | decrease |
| CAU | Control | T2 | T0 | 0.663 | 148000000 | 159000000 | increase |
| CAU | Control | T2 | T1 | 0.663 | 148000000 | 168000000 | increase |
| CYM | Control | T1 | T0 | 1.000 | 52400000 | 54000000 | increase |
| CYM | Control | T2 | T0 | 0.286 | 26500000 | 54000000 | increase |
| CYM | Control | T2 | T1 | 0.286 | 26500000 | 52400000 | increase |
library(readr)
library(dplyr)
library(ggplot2)
library(grid)
datos <- read_table("cell_shanon_data_12_05_25.txt") %>%
mutate(
Species = factor(Species),
Time = factor(Time, levels = c("T0","T1","T2")),
Treatment = factor(Treatment),
shanon = as.numeric(shanon)
)
summary_data <- datos %>%
group_by(Species, Treatment, Time) %>%
summarise(
mean_shanon = mean(shanon, na.rm = TRUE),
sd = sd(shanon, na.rm = TRUE),
n = n(),
se = sd / sqrt(n),
.groups = "drop"
)
t1_hw <- summary_data %>%
filter(Treatment == "HW", Time == "T1") %>%
select(Species, y = mean_shanon)
t2_hw <- summary_data %>%
filter(Treatment == "HW", Time == "T2") %>%
select(Species, yend = mean_shanon)
segments_hw <- left_join(t1_hw, t2_hw, by = "Species")
t12 <- summary_data %>% filter(Time %in% c("T1","T2"))
rango12 <- range(t12$mean_shanon - t12$se, t12$mean_shanon + t12$se)
ymin12 <- rango12[1] - 0.55 * diff(rango12)
ymax12 <- rango12[2] + 0.25 * diff(rango12)
p_d<- ggplot() +
geom_jitter(
data = datos,
aes(x = Time, y = shanon, color = Species, shape = Treatment),
width = 0.1, alpha = 0.4, size = 1.2
) +
geom_line(
data = summary_data,
aes(x = Time, y = mean_shanon,
color = Species,
linetype = Treatment,
group = interaction(Species, Treatment)),
size = 1
) +
geom_point(
data = summary_data,
aes(x = Time, y = mean_shanon, color = Species, shape = Treatment),
size = 3
) +
geom_errorbar(
data = summary_data,
aes(x = Time,
ymin = mean_shanon - se,
ymax = mean_shanon + se,
color = Species),
width = 0.2, size = 1.25
) +
geom_segment(
data = summary_data,
aes(y = 5.072667,
yend = 5.056000,
color = "CYM"),
x = 2,
xend = 3,
linetype = "dashed",
size = 1.5
) +
geom_segment(
data = summary_data,
aes(y = 5.201667,
yend = 5.180333,
color = "CAU"),
x = 2,
xend = 3,
linetype = "dashed",
size = 1.5
) +
scale_color_manual(name = "Species",
values = c("CAU" = "#008ae6", "CYM" = "#ff751a")) +
scale_shape_manual(name = "Treatment",
values = c("Control" = 16, # circles filled
"HW" = 17)) +# triangles filled
scale_linetype_manual(name = "Treatment",
values = c("Control" = "solid",
"HW" = "dashed")) +
guides(
shape = guide_legend(override.aes = list(size = 3)),
linetype = guide_legend(override.aes = list(size = 1.5))
) +
labs(
x = "Time",
y = "Shannon diversity index",
color = "Species",
shape = "Treatment",
linetype = "Treatment"
) +
theme_minimal(base_size = 12) +
theme(
axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"),
legend.title = element_text(face = "bold")
) +
coord_cartesian(ylim = c(ymin12, ymax12)) +
scale_y_continuous(
breaks = pretty(c(ymin12, ymax12), n = 5),
expand = expansion(mult = c(0,0))
)
p_d
We summarize the statistical tests used to evaluate microbial diversity (Shannon index) across species, time, and treatments. n T1→T2), and (v) compare Control vs Heatwave at T2 within each species.
## Overall (all times + treatments pooled)
bt_overall <- suppressWarnings(stats::wilcox.test(shanon ~ Species, data = datos, exact = FALSE))
bt_overall_p <- bt_overall$p.value
## By time (T0, T1, T2)
bt_by_time <- datos %>%
group_by(Time) %>%
summarise(
med_CAU = median(shanon[Species == "CAU"], na.rm = TRUE),
med_CYM = median(shanon[Species == "CYM"], na.rm = TRUE),
p = {
d <- cur_data()
suppressWarnings(stats::wilcox.test(d$shanon ~ d$Species, exact = FALSE)$p.value)
},
.groups = "drop"
)
kable(bt_by_time, digits = 3, caption = "Between-species Wilcoxon by time (Shannon)")
| Time | med_CAU | med_CYM | p |
|---|---|---|---|
| T0 | 5.188 | 4.927 | 0.081 |
| T1 | 5.198 | 5.056 | 0.383 |
| T2 | 5.207 | 5.080 | 0.173 |
bt_overall_p
## [1] 0.01300303
acclim <- datos %>%
group_by(Species) %>%
summarise(
med_T0 = median(shanon[Time == "T0"], na.rm = TRUE),
med_T1 = median(shanon[Time == "T1"], na.rm = TRUE),
p = {
x <- shanon[Time == "T0"]; y <- shanon[Time == "T1"]
if (length(x) > 0 && length(y) > 0)
suppressWarnings(stats::wilcox.test(x, y, exact = FALSE)$p.value)
else NA_real_
},
direction = case_when(med_T1 > med_T0 ~ "increase",
med_T1 < med_T0 ~ "decrease",
TRUE ~ "no_change"),
.groups = "drop"
)
kable(acclim, digits = 3, caption = "T0 vs T1 (acclimation) by species")
| Species | med_T0 | med_T1 | p | direction |
|---|---|---|---|---|
| CAU | 5.188 | 5.198 | 0.663 | increase |
| CYM | 4.927 | 5.056 | 0.383 | increase |
datos <- read_table("cell_shanon_data_12_05_25_complete.txt") %>%
mutate(
Species = factor(Species),
Time = factor(Time, levels = c("T0","T1","T2")),
Treatment = factor(Treatment),
cells = as.numeric(cells)
)
kw_ctrl <- datos %>%
filter(Treatment == "Control") %>%
group_by(Species) %>%
summarise(
kw_p = {
d <- cur_data()
suppressWarnings(kruskal.test(shanon ~ Time, data = d)$p.value)
},
.groups = "drop"
)
kable(kw_ctrl, digits = 3, caption = "Kruskal–Wallis (Control)")
| Species | kw_p |
|---|---|
| CAU | 0.733 |
| CYM | 0.432 |
kw_hw <- datos %>%
filter(Treatment == "HW") %>%
group_by(Species) %>%
summarise(
kw_p = {
d <- cur_data()
suppressWarnings(kruskal.test(shanon ~ Time, data = d)$p.value)
},
.groups = "drop"
)
kable(kw_hw, digits = 3, caption = "Kruskal–Wallis (Heatwave)")
| Species | kw_p |
|---|---|
| CAU | 0.875 |
| CYM | 0.288 |
library(dplyr)
library(tidyr)
library(knitr)
target_var <- "shanon"
dat <- datos %>%
filter(Species == "CAU", Treatment == "Control", Time %in% c("T1","T2")) %>%
select(ID = dplyr::any_of("ID"), Time, value = all_of(target_var)) %>%
filter(!is.na(value))
summary_tbl <- dat %>%
group_by(Time) %>%
summarise(
n = dplyr::n(),
mean = mean(value),
sd = sd(value),
se = sd/sqrt(n),
.groups = "drop"
)
knitr::kable(summary_tbl, digits = 3,
caption = "Caulerpa – Control: T1 vs T2 (mean, SD, n)")
| Time | n | mean | sd | se |
|---|---|---|---|---|
| T1 | 3 | 5.202 | 0.075 | 0.043 |
| T2 | 3 | 5.195 | 0.047 | 0.027 |
tt_welch <- t.test(value ~ Time, data = dat, var.equal = FALSE)
tt_welch
##
## Welch Two Sample t-test
##
## data: value by Time
## t = 0.13793, df = 3.353, p-value = 0.8982
## alternative hypothesis: true difference in means between group T1 and group T2 is not equal to 0
## 95 percent confidence interval:
## -0.1453022 0.1593022
## sample estimates:
## mean in group T1 mean in group T2
## 5.201667 5.194667
dat <- datos %>%
filter(Species == "CYM", Treatment == "Control", Time %in% c("T1","T2")) %>%
select(ID = dplyr::any_of("ID"), Time, value = all_of(target_var)) %>%
filter(!is.na(value))
summary_tbl <- dat %>%
group_by(Time) %>%
summarise(
n = dplyr::n(),
mean = mean(value),
sd = sd(value),
se = sd/sqrt(n),
.groups = "drop"
)
knitr::kable(summary_tbl, digits = 3,
caption = "Cymodocea – Control: T1 vs T2 (mean, SD, n)")
| Time | n | mean | sd | se |
|---|---|---|---|---|
| T1 | 3 | 5.073 | 0.165 | 0.095 |
| T2 | 3 | 5.084 | 0.203 | 0.117 |
tt_welch <- t.test(value ~ Time, data = dat, var.equal = FALSE)
tt_welch
##
## Welch Two Sample t-test
##
## data: value by Time
## t = -0.077316, df = 3.8365, p-value = 0.9422
## alternative hypothesis: true difference in means between group T1 and group T2 is not equal to 0
## 95 percent confidence interval:
## -0.4377554 0.4144221
## sample estimates:
## mean in group T1 mean in group T2
## 5.072667 5.084333
dat <- datos %>%
filter(Species == "CAU", Treatment == "HW", Time %in% c("T1","T2")) %>%
select(ID = dplyr::any_of("ID"), Time, value = all_of(target_var)) %>%
filter(!is.na(value))
summary_tbl <- dat %>%
group_by(Time) %>%
summarise(
n = dplyr::n(),
mean = mean(value),
sd = sd(value),
se = sd/sqrt(n),
.groups = "drop"
)
knitr::kable(summary_tbl, digits = 3,
caption = "Caulerpa – HWl: T1 vs T2 (mean, SD, n)")
| Time | n | mean | sd | se |
|---|---|---|---|---|
| T1 | 3 | 5.202 | 0.075 | 0.043 |
| T2 | 3 | 5.180 | 0.139 | 0.080 |
tt_welch <- t.test(value ~ Time, data = dat, var.equal = FALSE)
tt_welch
##
## Welch Two Sample t-test
##
## data: value by Time
## t = 0.23467, df = 3.0672, p-value = 0.8293
## alternative hypothesis: true difference in means between group T1 and group T2 is not equal to 0
## 95 percent confidence interval:
## -0.2644285 0.3070952
## sample estimates:
## mean in group T1 mean in group T2
## 5.201667 5.180333
dat <- datos %>%
filter(Species == "CYM", Treatment == "HW", Time %in% c("T1","T2")) %>%
select(ID = dplyr::any_of("ID"), Time, value = all_of(target_var)) %>%
filter(!is.na(value))
summary_tbl <- dat %>%
group_by(Time) %>%
summarise(
n = dplyr::n(),
mean = mean(value),
sd = sd(value),
se = sd/sqrt(n),
.groups = "drop"
)
knitr::kable(summary_tbl, digits = 3,
caption = "Cymodocea – HW: T1 vs T2 (mean, SD, n)")
| Time | n | mean | sd | se |
|---|---|---|---|---|
| T1 | 3 | 5.073 | 0.165 | 0.095 |
| T2 | 3 | 5.056 | 0.106 | 0.061 |
tt_welch <- t.test(value ~ Time, data = dat, var.equal = FALSE)
tt_welch
##
## Welch Two Sample t-test
##
## data: value by Time
## t = 0.14762, df = 3.4062, p-value = 0.891
## alternative hypothesis: true difference in means between group T1 and group T2 is not equal to 0
## 95 percent confidence interval:
## -0.3195780 0.3529113
## sample estimates:
## mean in group T1 mean in group T2
## 5.072667 5.056000
Datos.Muestras=data.frame(read_excel("TablaOTUs.xlsx"))
Samples.original=Datos.Muestras$ID
DF.0.original=Datos.Muestras[,-c(1,2)]
row.names(DF.0.original)=Samples.original
OTUs=paste("OTU",1:dim(DF.0.original)[2],sep="")
names(DF.0.original)=OTUs
Sizes=rowSums(DF.0.original)
Tipos.original=as.factor(rep(c("CAU_T0","CAU_T1","CAU_T2C", "CAU_T2HW","CYM_T0","CYM_T1", "CYM_T2C","CYM_T2HW"),each=3))
We have quantified the biodiversity of every type of samples CAU_T* or CYM_T* by means of the Shannon index of the sample obtained by merging the 3 samples of that type. To test which pairs of sample types had significantly different Shannon indices, we have used the Hutcheson t-test, as implemented in function Hutcheson_t_test of the R package ecolTest, and adjusted the p-values with the Holm method.
# Shannon index
SH=function(x){
vegan::diversity(x,index = "shannon")
}
p.lnp2=function(p){
if (p==0){return(0)}
else {
return(p*(log(p)^2))
}
}
# Hutcheson variance estimator
Var.SH=function(x){
N=sum(x)
y=sapply(x/sum(x),FUN=p.lnp2)
bit=sapply(x,FUN=function(x){min(x,1)})
S=sum(bit)
vv=(sum(y)-SH(x)^2)/N+(S-1)/(2*N^2)
return(vv)
}
# Standard deviation estimator
sd.SH=function(x){
sqrt(Var.SH(x))
}
Samples=Samples.original
DF.0=DF.0.original
Tipos=Tipos.original
DF.0.agrup=aggregate(DF.0,by=list(Tipos),FUN=sum)[,-1]
For each sample, we compute its Shannon index and an estimation of its standard deviation (sd):
Resultados=data.frame(Samples,
round(apply(DF.0,MARGIN=1,SH),4),
round(apply(DF.0,MARGIN=1,sd.SH),4))
row.names(Resultados)=NULL
names(Resultados)=c("Samples","SH","sd")
flextable(Resultados)|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Samples | SH | sd |
|---|---|---|
CAU_T0_1 | 5.2297 | 0.0099 |
CAU_T0_2 | 5.0583 | 0.0083 |
CAU_T0_3 | 5.1884 | 0.0108 |
CAU_T1_1 | 5.2781 | 0.0098 |
CAU_T1_2 | 5.1297 | 0.0099 |
CAU_T1_3 | 5.1981 | 0.0104 |
CAU_T2C_1 | 5.1477 | 0.0102 |
CAU_T2C_2 | 5.1980 | 0.0096 |
CAU_T2C_3 | 5.2405 | 0.0109 |
CAU_T2HW_1 | 5.2974 | 0.0072 |
CAU_T2HW_2 | 5.0278 | 0.0085 |
CAU_T2HW_3 | 5.2167 | 0.0078 |
CYM_T0_1 | 4.8018 | 0.0083 |
CYM_T0_2 | 4.9936 | 0.0098 |
CYM_T0_3 | 4.9272 | 0.0081 |
CYM_T1_1 | 4.9171 | 0.0052 |
CYM_T1_2 | 5.0564 | 0.0056 |
CYM_T1_3 | 5.2460 | 0.0065 |
CYM_T2C_1 | 5.3003 | 0.0063 |
CYM_T2C_2 | 5.0557 | 0.0063 |
CYM_T2C_3 | 4.8975 | 0.0066 |
CYM_T2HW_1 | 4.9353 | 0.0067 |
CYM_T2HW_2 | 5.1045 | 0.0064 |
CYM_T2HW_3 | 5.1299 | 0.0066 |
The next graphic depicts, for each sample, its Shannon index \(SH\) and the error bar \(SH\pm 2\cdot sd\).
row.names(Resultados)=Samples
plot(0.1*(1:24),Resultados$SH,
col=rep(c(rep("green",3),rep("blue",3),rep("purple",3),rep("red",3)),2),
pch=c(rep(20,12),rep(18,12)),cex=1,
xlab="",ylab="Shannon index",xaxt='n',
main="All samples")
legend("bottomleft",col=c("black","black","green","blue","purple","red"),
pch=c(20,18,NA,NA,NA,NA),
lty=c(NA,NA,1,1,1,1),
legend=c("CAU","CYM","T0","T1","T2C","T2HW"),
seg.len=0.5,cex=0.5)
segments(0.1*(1:24),Resultados$SH-2*Resultados$sd,
0.1*(1:24),Resultados$SH+2*Resultados$sd,
col=rep(c(rep("green",3),rep("blue",3),rep("purple",3),rep("red",3)),2),lwd=1.5)
We repeat the process for sample types: for each sample type, we merge all three samples of that type into a single sample. We include the intervals (SH-3·sd,SH+3·sd). The reason for the factor 3 is that there are 36 pairs of samples to compare, and hence we use as factor for sd the 0.951/36 quantile of the standard normal distribution. In this way, disjoint intervals give statistical evidence (at the 5% signification level) that the corresponding two indices are different.
Resultados.agrupados=data.frame(levels(Tipos),
round(apply(DF.0.agrup,MARGIN=1,SH),4),
round(apply(DF.0.agrup,MARGIN=1,sd.SH),4),
round(apply(DF.0.agrup,MARGIN=1,SH)-3*apply(DF.0.agrup,MARGIN=1,sd.SH),4),
round(apply(DF.0.agrup,MARGIN=1,SH)+3*apply(DF.0.agrup,MARGIN=1,sd.SH),4)
)
row.names(Resultados.agrupados)=NULL
names(Resultados.agrupados)=c("Samples","SH","sd","SH-3sd","SH+3sd")
flextable(Resultados.agrupados)|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Samples | SH | sd | SH-3sd | SH+3sd |
|---|---|---|---|---|
CAU_T0 | 5.2009 | 0.0056 | 5.1842 | 5.2176 |
CAU_T1 | 5.2607 | 0.0059 | 5.2428 | 5.2785 |
CAU_T2C | 5.2590 | 0.0060 | 5.2410 | 5.2770 |
CAU_T2HW | 5.2457 | 0.0046 | 5.2319 | 5.2594 |
CYM_T0 | 4.9321 | 0.0051 | 4.9168 | 4.9474 |
CYM_T1 | 5.1190 | 0.0034 | 5.1089 | 5.1291 |
CYM_T2C | 5.1226 | 0.0038 | 5.1113 | 5.1338 |
CYM_T2HW | 5.0875 | 0.0039 | 5.0759 | 5.0991 |
plot(0.1*(1:8),Resultados.agrupados$SH,col=rep(c("green","blue","purple","red"),2),
pch=c(rep(20,4),rep(18,4)),
xlab="",ylab="Shannon index",xaxt='n',
main="All sample types")
segments(0.1*(1:8),Resultados.agrupados$SH-3*Resultados.agrupados$sd,
0.1*(1:8),Resultados.agrupados$SH+3*Resultados.agrupados$sd,
col=rep(c("green","blue","purple","red"),2),lwd=1.5)
legend("bottomleft",col=c("black","black","green","blue","purple","red"),
pch=c(20,18,NA,NA,NA,NA),
lty=c(NA,NA,1,1,1,1),
legend=c("CAU","CYM","T0","T1","T2C","T2HW"),
seg.len=0.5,cex=0.5)
SH indices of CAU samples are significantly different from (and actually significantly larger than) those of CYM samples.
plot(0.1*(1:4),Resultados.agrupados$SH[1:4],col=c("green","blue","purple","red"),
pch=20,main="CAU sample types",
xlab="",ylab="Shannon index",xaxt='n',
ylim=c(min(Resultados.agrupados$SH[1:4]-3*Resultados.agrupados$sd[1:4]),
max(Resultados.agrupados$SH[1:4]+3*Resultados.agrupados$sd[1:4])))
segments(0.1*(1:4),Resultados.agrupados$SH[1:4]-3*Resultados.agrupados$sd[1:4],
0.1*(1:4),Resultados.agrupados$SH[1:4]+3*Resultados.agrupados$sd[1:4],
col=c("green","blue","purple","red"),lwd=1.5)
legend("topright",col=c("green","blue","purple","red"),
pch=20,legend=c("T0","T1","T2C","T2HW"),seg.len=0.5,cex=0.5)
Only the SH index of T0 samples seems to be significantly different from the others.
plot(0.1*(1:4),Resultados.agrupados$SH[5:8],col=c("green","blue","purple","red"),
pch=18,main="CYM sample types",
xlab="",ylab="Shannon index",xaxt='n', ylim=c(min(Resultados.agrupados$SH[5:8]-3*Resultados.agrupados$sd[5:8]),max(Resultados.agrupados$SH[5:8]+3*Resultados.agrupados$sd[5:8])))
segments(0.1*(1:4),Resultados.agrupados$SH[5:8]-3*Resultados.agrupados$sd[5:8],
0.1*(1:4),Resultados.agrupados$SH[5:8]+3*Resultados.agrupados$sd[5:8],
col=c("green","blue","purple","red"),lwd=1.5)
legend("topleft",col=c("green","blue","purple","red"),
pch=18,legend=c("T0","T1","T2C","T2HW"),seg.len=0.5,cex=0.5)
All four SH indices are significantly different, except for T1 and T2C.
We can use Hutcheson t-test to decide whether Shannon indices are significantly different. We perform pairwise comparisons and adjust the p-values using the Holm method. The conclusions are the same.
pvalSH.G=matrix(NA,nrow=8,ncol=8)
for (i in 1:7){
for (j in (i+1):8){
pvalSH.G[i,j]= Hutcheson_t_test(DF.0.agrup[i,],DF.0.agrup[j,])$p.value
}
}
pvalSH.G=matrix(p.adjust(pvalSH.G,method="holm"),nrow=8)
colnames(pvalSH.G)=levels(Tipos)
row.names(pvalSH.G)=levels(Tipos)
flextable(data.frame(round(pvalSH.G,6)))|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
CAU_T0 | CAU_T1 | CAU_T2C | CAU_T2HW | CYM_T0 | CYM_T1 | CYM_T2C | CYM_T2HW |
|---|---|---|---|---|---|---|---|
0 | 0.000000 | 0.000000 | 0 | 0 | 0.000000 | 0 | |
0.950346 | 0.179925 | 0 | 0 | 0.000000 | 0 | ||
0.228577 | 0 | 0 | 0.000000 | 0 | |||
0 | 0 | 0.000000 | 0 | ||||
0 | 0.000000 | 0 | |||||
0.950346 | 0 | ||||||
0 | |||||||
We address the following question: If we set a threshold \(p_0\), we only retain from each sample the OTUs that appear with a relative frequency \(\geqslant p_0\) and calculate the Shannon index (SH) of those samples, how does its longitudinal behavior vary from T0->T1->T2C->T2HW as \(p_0\) increases?
We use the following method to answer this question:
We obtain that, in the CAU samples, at some point between the 0.15% and 0.2% thresholds, and in the CYM samples, at some point between the 0.1% and 0.15% thresholds, the longitudinal behavior of the SH changes for the first time from that observed in the whole samples, shifting from a rise-fall-fall pattern to a rise-fall-rise pattern.
For each type of CAU sample, around 12-13% of OTUs present in at least one sample of this type have relative frequency in all three samples of that type greater than or equal to 0.1%. For CYM sample types, this figure descends to 9-10%.
# threshold
llindar=function(x,l){
if (x>=l){return(1)}
else{return(0)}
}
#Global sample
DF.0=as.matrix(DF.0)
DF.0.prop=DF.0/rowSums(DF.0)
DF.0.agrup=as.matrix(DF.0.agrup)
DF.0.agrup.prop=DF.0.agrup/rowSums(DF.0.agrup)
Original sample sizes and range of proportions within them:
Tamaños=rowSums(DF.0)
Props.min=round(apply(DF.0.prop,MARGIN=1,function(x){min(x[x>0])}),6)
Props.max=apply(DF.0.prop,MARGIN=1,function(x){max(x[x>0])})
Info=data.frame(Tamaños,Props.min,Props.max)
names(Info)=c("Size","Smallest prop.","Largest prop.")
flextable(Info)|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Size | Smallest prop. | Largest prop. |
|---|---|---|
32452 | 0.000031 | 0.05805497 |
45161 | 0.000022 | 0.05832466 |
26298 | 0.000038 | 0.05354019 |
31355 | 0.000032 | 0.05243183 |
30592 | 0.000033 | 0.05458944 |
29368 | 0.000034 | 0.07147235 |
30554 | 0.000033 | 0.05194083 |
33119 | 0.000030 | 0.05694616 |
26568 | 0.000038 | 0.06884222 |
62616 | 0.000016 | 0.06188514 |
43726 | 0.000023 | 0.06026163 |
50715 | 0.000020 | 0.05223307 |
57062 | 0.000018 | 0.10017174 |
38684 | 0.000026 | 0.09365629 |
56482 | 0.000018 | 0.09686272 |
145523 | 0.000007 | 0.11633900 |
122687 | 0.000008 | 0.08515980 |
82489 | 0.000012 | 0.08112597 |
91168 | 0.000011 | 0.06932257 |
97111 | 0.000010 | 0.09009278 |
97125 | 0.000010 | 0.12887516 |
88157 | 0.000011 | 0.14189457 |
94560 | 0.000011 | 0.11748096 |
89922 | 0.000011 | 0.11695692 |
Resultados.Agrupados=data.frame(
SH=apply(DF.0.agrup,MARGIN=1,SH),
Varianza=apply(DF.0.agrup,MARGIN=1,Var.SH))
##
plot(0.1*(1:8),Resultados.Agrupados$SH,col=rep(c("green","blue","purple","red"),2),
pch=c(rep(20,4),rep(18,4)),
xlab="",ylab="SH",xaxt='n',
main="Grouped samples")
segments(0.1*(1:8),Resultados.Agrupados$SH-3*sqrt(Resultados.Agrupados$Varianza),
0.1*(1:8),Resultados.Agrupados$SH+3*sqrt(Resultados.Agrupados$Varianza),
col=rep(c("green","blue","purple","red"),2),lwd=1.5)
legend("topright",col=c("black","black","green","blue","purple","red"),
pch=c(20,18,NA,NA,NA,NA),
lty=c(NA,NA,1,1,1,1),
legend=c("CAU","CYM","T0","T1","T2C","T2HW"),
seg.len=0.5,cex=0.5)
#
plot(0.1*(1:4),Resultados.Agrupados$SH[1:4],col=c("green","blue","purple","red"),
pch=20,main="Grouped samples: CAU",
xlab="",ylab="SH",xaxt='n',
ylim=c(min(Resultados.Agrupados$SH[1:4]-3*sqrt(Resultados.Agrupados$Varianza[1:4])),
max(Resultados.Agrupados$SH[1:4]+3*sqrt(Resultados.Agrupados$Varianza[1:4]))))
segments(0.1*(1:4),Resultados.Agrupados$SH[1:4]-3*sqrt(Resultados.Agrupados$Varianza[1:4]),
0.1*(1:4),Resultados.Agrupados$SH[1:4]+3*sqrt(Resultados.Agrupados$Varianza[1:4]),
col=c("green","blue","purple","red"),lwd=1.5)
legend("topleft",col=c("green","blue","purple","red"),
pch=20,legend=c("T0","T1","T2C","T2HW"),cex=0.5)
#
plot(0.1*(1:4),Resultados.Agrupados$SH[5:8],col=c("green","blue","purple","red"),
pch=18,main="Grouped samples: CYM",
xlab="",ylab="SH",xaxt='n', ylim=c(min(Resultados.Agrupados$SH[5:8]-3*sqrt(Resultados.Agrupados$Varianza[5:8])),max(Resultados.Agrupados$SH[5:8]+3*sqrt(Resultados.Agrupados$Varianza[5:8]))))
segments(0.1*(1:4),Resultados.Agrupados$SH[5:8]-3*sqrt(Resultados.Agrupados$Varianza[5:8]),
0.1*(1:4),Resultados.Agrupados$SH[5:8]+3*sqrt(Resultados.Agrupados$Varianza[5:8]),
col=c("green","blue","purple","red"),lwd=1.5)
legend("topleft",col=c("green","blue","purple","red"),
pch=18,legend=c("T0","T1","T2C","T2HW"),cex=0.5)
Resultados=c()
for (i in 1:1000){
print(i)
i0=i*0.00005
llindar.0=function(x){llindar(x,i0)}
DD=vapply(DF.0.prop, llindar.0, numeric(1))*DF.0
DD.agrup=aggregate(DD,by=list(Tipos),FUN=sum)[,-1]
Resultados=rbind(Resultados,
c(apply(DD.agrup,MARGIN=1,SH),#Shannon
apply(DD.agrup,MARGIN=1,sd.SH),#Variança
apply(DD.agrup,MARGIN=1,SH)-3*apply(DD.agrup,MARGIN=1,sd.SH), #LE de l'IC
apply(DD.agrup,MARGIN=1,SH)+3*apply(DD.agrup,MARGIN=1,sd.SH),#UE de l'IC
i0 #llindar
))
}
Resultados=data.frame(Resultados)
names(Resultados)=c(paste("SH",levels(Tipos),sep="."),
paste("sd",levels(Tipos),sep="."),
paste("LE",levels(Tipos),sep="."),
paste("UE",levels(Tipos),sep="."),
"threshold")
saveRDS(Resultados, file="Resultados.Shannon.Umbrales.RData")
p.valores=c()
for (i in 1:1000){
print(i)
i0=i*0.00005
llindar.0=function(x){llindar(x,i0)}
DD=vapply(DF.0.prop, llindar.0, numeric(1))*DF.0
DD.agrup=aggregate(DD,by=list(Tipos),FUN=sum)[,-1]
pvalSH.G=matrix(NA,nrow=8,ncol=8)
for (i in 1:7){
for (j in (i+1):8){
pvalSH.G[i,j]= Hutcheson_t_test(DD.agrup[i,],DD.agrup[j,])$p.value
}
}
pvalSH.G=p.adjust(pvalSH.G,method="holm")
p.valores=rbind(p.valores,
c(i0,
pvalSH.G))
}
write.csv(p.valores, file="p.valores.Shannon.Umbrales.csv",row.names=FALSE)
The results are in the file “Resultados.Shannon.Umbrales.RData”. Its variables are (for each type of sample: CAU_T0, CAU_T1 etc.):
SH.type: The Shannon index (SH) of the union of the three samples of that type
sd.type: The estimated standard deviation of the SH of the union of the three samples of that type
LE.type: SH minus 3 times the estimated standard deviation
UE.type: SH plus 3 times the estimated standard deviation
Threshold: The corresponding threshold
We show its first 10 rows (rounded, and splitted into different tables, for horizontal space reasons), corresponding to thresholds from 0.00005 to 0.0005. :
Resultados=readRDS("Resultados.Shannon.Umbrales.RData")
names(Resultados)=c(paste("SH",levels(Tipos),sep="."),
paste("sd",levels(Tipos),sep="."),
paste("LE",levels(Tipos),sep="."),
paste("UE",levels(Tipos),sep="."),
"Threshold")
Resultados.r4=Resultados
Resultados.r4[,1:32]=round(Resultados.r4[,1:32],4)
flextable(Resultados.r4[1:10,c(33,1+8*(0:3),2+8*(0:3))])|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Threshold | SH.CAU_T0 | sd.CAU_T0 | LE.CAU_T0 | UE.CAU_T0 | SH.CAU_T1 | sd.CAU_T1 | LE.CAU_T1 | UE.CAU_T1 |
|---|---|---|---|---|---|---|---|---|
0.00005 | 5.1252 | 0.0053 | 5.1092 | 5.1412 | 5.1997 | 0.0058 | 5.1824 | 5.2170 |
0.00010 | 5.0481 | 0.0051 | 5.0326 | 5.0635 | 5.1092 | 0.0055 | 5.0926 | 5.1257 |
0.00015 | 5.0011 | 0.0050 | 4.9859 | 5.0162 | 5.0479 | 0.0054 | 5.0318 | 5.0641 |
0.00020 | 4.9245 | 0.0049 | 4.9098 | 4.9391 | 4.9787 | 0.0052 | 4.9630 | 4.9944 |
0.00025 | 4.8751 | 0.0048 | 4.8607 | 4.8895 | 4.9337 | 0.0052 | 4.9182 | 4.9491 |
0.00030 | 4.8395 | 0.0047 | 4.8253 | 4.8537 | 4.8735 | 0.0050 | 4.8584 | 4.8887 |
0.00035 | 4.7807 | 0.0046 | 4.7668 | 4.7946 | 4.8427 | 0.0050 | 4.8277 | 4.8577 |
0.00040 | 4.7456 | 0.0046 | 4.7319 | 4.7594 | 4.8042 | 0.0049 | 4.7893 | 4.8190 |
0.00045 | 4.7104 | 0.0045 | 4.6968 | 4.7239 | 4.7588 | 0.0049 | 4.7442 | 4.7733 |
0.00050 | 4.6612 | 0.0044 | 4.6479 | 4.6745 | 4.7291 | 0.0048 | 4.7146 | 4.7435 |
flextable(Resultados.r4[1:10,c(33,3+8*(0:3),4+8*(0:3))])|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Threshold | SH.CAU_T2C | sd.CAU_T2C | LE.CAU_T2C | UE.CAU_T2C | SH.CAU_T2HW | sd.CAU_T2HW | LE.CAU_T2HW | UE.CAU_T2HW |
|---|---|---|---|---|---|---|---|---|
0.00005 | 5.1943 | 0.0058 | 5.1769 | 5.2117 | 5.1539 | 0.0043 | 5.1408 | 5.1669 |
0.00010 | 5.1027 | 0.0056 | 5.0860 | 5.1194 | 5.0714 | 0.0042 | 5.0588 | 5.0839 |
0.00015 | 5.0536 | 0.0055 | 5.0372 | 5.0699 | 5.0186 | 0.0041 | 5.0063 | 5.0309 |
0.00020 | 4.9697 | 0.0053 | 4.9539 | 4.9855 | 4.9529 | 0.0040 | 4.9409 | 4.9649 |
0.00025 | 4.9234 | 0.0052 | 4.9078 | 4.9389 | 4.9069 | 0.0039 | 4.8952 | 4.9187 |
0.00030 | 4.8896 | 0.0051 | 4.8743 | 4.9050 | 4.8537 | 0.0038 | 4.8422 | 4.8653 |
0.00035 | 4.8425 | 0.0050 | 4.8274 | 4.8576 | 4.8159 | 0.0038 | 4.8045 | 4.8274 |
0.00040 | 4.8079 | 0.0050 | 4.7929 | 4.8228 | 4.7660 | 0.0037 | 4.7548 | 4.7772 |
0.00045 | 4.7851 | 0.0050 | 4.7702 | 4.7999 | 4.7365 | 0.0037 | 4.7254 | 4.7476 |
0.00050 | 4.7406 | 0.0049 | 4.7259 | 4.7553 | 4.7040 | 0.0037 | 4.6930 | 4.7150 |
flextable(Resultados.r4[1:10,c(33,5+8*(0:3),6+8*(0:3))])|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Threshold | SH.CYM_T0 | sd.CYM_T0 | LE.CYM_T0 | UE.CYM_T0 | SH.CYM_T1 | sd.CYM_T1 | LE.CYM_T1 | UE.CYM_T1 |
|---|---|---|---|---|---|---|---|---|
0.00005 | 4.8502 | 0.0049 | 4.8355 | 4.8649 | 5.0004 | 0.0032 | 4.9909 | 5.0100 |
0.00010 | 4.7581 | 0.0047 | 4.7440 | 4.7722 | 4.9137 | 0.0031 | 4.9045 | 4.9229 |
0.00015 | 4.6853 | 0.0046 | 4.6717 | 4.6990 | 4.8593 | 0.0030 | 4.8503 | 4.8684 |
0.00020 | 4.6284 | 0.0045 | 4.6149 | 4.6418 | 4.7987 | 0.0030 | 4.7898 | 4.8075 |
0.00025 | 4.5794 | 0.0044 | 4.5662 | 4.5927 | 4.7475 | 0.0029 | 4.7387 | 4.7562 |
0.00030 | 4.5353 | 0.0044 | 4.5222 | 4.5483 | 4.7035 | 0.0029 | 4.6948 | 4.7121 |
0.00035 | 4.5003 | 0.0043 | 4.4874 | 4.5132 | 4.6659 | 0.0029 | 4.6573 | 4.6745 |
0.00040 | 4.4627 | 0.0043 | 4.4499 | 4.4754 | 4.6107 | 0.0028 | 4.6023 | 4.6191 |
0.00045 | 4.4162 | 0.0042 | 4.4036 | 4.4288 | 4.5693 | 0.0028 | 4.5609 | 4.5776 |
0.00050 | 4.3812 | 0.0042 | 4.3687 | 4.3937 | 4.5383 | 0.0028 | 4.5300 | 4.5466 |
flextable(Resultados.r4[1:10,c(33,7+8*(0:3),8+8*(0:3))])|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Threshold | SH.CYM_T2C | sd.CYM_T2C | LE.CYM_T2C | UE.CYM_T2C | SH.CYM_T2HW | sd.CYM_T2HW | LE.CYM_T2HW | UE.CYM_T2HW |
|---|---|---|---|---|---|---|---|---|
0.00005 | 5.0126 | 0.0036 | 5.0019 | 5.0233 | 4.9765 | 0.0037 | 4.9655 | 4.9875 |
0.00010 | 4.9095 | 0.0034 | 4.8992 | 4.9197 | 4.8922 | 0.0036 | 4.8816 | 4.9029 |
0.00015 | 4.8446 | 0.0033 | 4.8346 | 4.8546 | 4.8148 | 0.0035 | 4.8044 | 4.8252 |
0.00020 | 4.7855 | 0.0033 | 4.7756 | 4.7953 | 4.7597 | 0.0034 | 4.7494 | 4.7699 |
0.00025 | 4.7292 | 0.0032 | 4.7195 | 4.7388 | 4.7049 | 0.0034 | 4.6948 | 4.7150 |
0.00030 | 4.6798 | 0.0032 | 4.6702 | 4.6893 | 4.6562 | 0.0033 | 4.6462 | 4.6661 |
0.00035 | 4.6419 | 0.0032 | 4.6324 | 4.6513 | 4.6095 | 0.0033 | 4.5997 | 4.6193 |
0.00040 | 4.5895 | 0.0031 | 4.5802 | 4.5988 | 4.5639 | 0.0032 | 4.5542 | 4.5736 |
0.00045 | 4.5482 | 0.0031 | 4.5390 | 4.5575 | 4.5322 | 0.0032 | 4.5226 | 4.5418 |
0.00050 | 4.5203 | 0.0030 | 4.5111 | 4.5294 | 4.5011 | 0.0032 | 4.4915 | 4.5106 |
We start by checking several thresholds:
Dibu.global=function(p){
Tall=as.matrix(Resultados[Resultados$Threshold==p,])
plot(0.1*(1:8),Tall[1,1:8],
col=rep(c("green","blue","purple","red"),2),
pch=c(rep(20,4),rep(18,4)),
xlab="",ylab="SH",xaxt='n',main=paste("Threshold",p,sep=" "))
segments(x0=0.1*(1:8),y0=Tall[1,17:24],
x1=0.1*(1:8),y1=Tall[1,25:32],
col=rep(c("green","blue","purple","red"),2),lwd=1.5)
legend("topright",col=c("black","black","green","blue","purple","red"),
pch=c(20,18,NA,NA,NA,NA),
lty=c(NA,NA,1,1,1,1),
legend=c("CAU","CYM","T0","T1","T2C","T2HW"),
seg.len=0.5,cex=0.5)
}
Dibu.global(0.0001)
Dibu.global(0.001)
Dibu.global(0.0025)
Dibu.global(0.005)
Dibu.global("0.0075")
Dibu.global(0.01)
Dibu.global(0.025)
As it can be seen, the longitudinal behavior of Shannon indices changes from p0=0.1% (which is similar to the global behaviour) to 0.25%.
Let us see the graphics for thresholds between 0.001 and 0.005:
for (i in 0:8){
Dibu.global(0.001+0.0005*i)
}
For each type of samples, the next graph represents the proportion of OTUs present in at least one sample of this type whose relative frequency in all three samples of that type is greater than or equal to the percentage indicated on the x-axis. The proportions are also given in a table.
n=seq(from=0.001,to=0.003,by=0.0005)
N=length(n)
Result=n
for (i in 1:8){
X1=DF.0.prop[3*(i-1)+1,]
X1=sort(X1[X1!=0])
X2=DF.0.prop[3*(i-1)+2,]
X2=sort(X2[X2!=0])
X3=DF.0.prop[3*(i-1)+3,]
X3=sort(X3[X3!=0])
PP=rep(0,N)
for(j in 1:N){PP[j]=length(X1[X1>=n[j]] & X2[X2>=n[j]] & X3[X3>=n[j]])/length(X1[X1>0] | X2[X2>0] | X3[X3>0])}
plot(100*n,PP,pch=20,type="l",lwd=1.5,xaxp=c(0.1,0.3,N),yaxp=c(0,0.2,20),xlab="%",ylab="Proportion of OTUs whose percentage is higher than ...",main=Tipos[i])
abline(v=0.01*(0:100),lwd=0.5,col="red")
abline(h=0.01*(0:100),lwd=0.5,col="red")
abline(v=0.15,lwd=1,col="blue")
Result=cbind(Result,round(PP,4))
}
Result=data.frame(Result)
names(Result)=c("Threshold",levels(Tipos))
row.names(Result)=NULL
flextable(Result)|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Threshold | CAU_T0 | CAU_T1 | CAU_T2C | CAU_T2HW | CYM_T0 | CYM_T1 | CYM_T2C | CYM_T2HW |
|---|---|---|---|---|---|---|---|---|
0.0010 | 0.1269 | 0.1283 | 0.1394 | 0.1162 | 0.1027 | 0.0896 | 0.0922 | 0.0901 |
0.0015 | 0.0968 | 0.0929 | 0.1004 | 0.0829 | 0.0759 | 0.0662 | 0.0643 | 0.0683 |
0.0020 | 0.0751 | 0.0765 | 0.0797 | 0.0652 | 0.0610 | 0.0508 | 0.0539 | 0.0550 |
0.0025 | 0.0609 | 0.0650 | 0.0632 | 0.0546 | 0.0521 | 0.0388 | 0.0446 | 0.0459 |
0.0030 | 0.0509 | 0.0526 | 0.0563 | 0.0446 | 0.0417 | 0.0342 | 0.0348 | 0.0363 |
According to the Bray-Curtis distance (correctly applied to compositions), the CAU and CYM samples at T0 are more different than at T1, more similar at T1 than at T2C, and again more similar at T2HW than at T2C (or than at T1 or T0, for that matter). Overall, the global sets of CAU and CYM samples are more different than the starting T0 samples, as if part of the original differences were diluted. Are these differences statistically significant?
Well, we have devised a method to assess the statistical significance of the differences in median distances between two pairs of samples groups, based on a resampling procedure. Using this method, we obtain that these convergences and divergences from T0 to T1, from T1 to T2C, from T2C to T2HW, and from T0 to the global sample, are statistically significant.
DF.0.agrup=aggregate(DF.0,by=list(Tipos),FUN=sum)[,-1]
The next table shows, for each level T0, T1, T2C, T2HW, and Global (all samples), the median Bray-Curtis (BC) distances between the compositions of the CAU and CYM samples at those levels.
Times | Average BC-dist. |
|---|---|
T0 | 0.4124 |
T1 | 0.4097 |
T2C | 0.4493 |
T2HW | 0.3766 |
Global | 0.4189 |
The BC-distances decrease from T0 to T1. Then, from T1 to T2C they increase, while from T1 to T2HW they decrease further. Also, from T0 to Global it decreases. Are these “convergences” and “divergences” of samples statistically significant?
We want to perform a contrast with null hypothesis “The CAU_T1 and CYM_T1 samples are as similar as the CAU_T0 and CYM_T0 samples” and alternative hypothesis “The CAU_T1 and CYM_T1 samples are more similar than the CAU_T0 and CYM_T0 samples”.
To solve this contrast, we resort to a resampling process. The broad idea is:
On the one hand, we merge all 3 samples CAU_T0 in a single sample, and all 3 samples CYM_T0 in another single sample, and we repeat 1000 times the following:
In this way, we obtain a vector of 1000 median distances between CAU and CYM for T0
On the other hand, we merge all 6 samples CAU_T0 and CAU_T1 in a single sample, and all 6 samples CYM_T0 and CYM_T1 in another single sample, and we repeat 1000 times the following:
In this way, we obtain a vector of 1000 median distances between CAU and CYM for the union of T0 and T1
The idea is that if the T1 samples are “as similar” as the T0 samples, the median distances between triples of disjoint random samples from the merged T0 samples should be similar to the median distances between triples of disjoint random samples (of the same sizes) from the merged T0+T1 samples. Conversely, if the T1 samples are “more similar” than the T0 samples, the median distances between triples of random samples from the merged T0 samples should tend to be larger than the median distances between triples of random samples (of the same sizes) from the merged T0+T1 samples.
So, we can use as p-value the proportion of pairs (median distance between CAU and CYM for T0, median distance between CAU and CYM for T0+T1) where the first entry is smaller than the second entry. If the T1 samples are “as similar” as the T0 samples, we would expect this to happen half of times, while if the T0 samples are more different than the T1 samples, we would expect this to happen with low frequency.
The problem is that the CYM samples are larger than CAU samples, and it can bias the samples; for instance if we merge CAU_T1 and CYM_T1 samples and take random samples, most of the taxa will come from CYM:
Samples | Sizes |
|---|---|
CAU_T0_1 | 32452 |
CAU_T0_2 | 45161 |
CAU_T0_3 | 26298 |
CAU_T1_1 | 31355 |
CAU_T1_2 | 30592 |
CAU_T1_3 | 29368 |
CYM_T0_1 | 57062 |
CYM_T0_2 | 38684 |
CYM_T0_3 | 56482 |
CYM_T1_1 | 145523 |
CYM_T1_2 | 122687 |
CYM_T1_3 | 82489 |
To solve this drawback, and since we apply BC-distances to proportions, we reescale all samples to size 106 (up to rounding; we need integer frequencies for the simulations) before merging.
DF=round(DF.0.prop*10^6)
DF.prop=DF/rowSums(DF)
DF.agrup=aggregate(DF,by=list(Tipos),FUN=sum)[,-1]
Sizes=rowSums(DF)
The new sample sizes are
Samples | Sizes |
|---|---|
CAU_T0_1 | 1000070 |
CAU_T0_2 | 999907 |
CAU_T0_3 | 999911 |
CAU_T1_1 | 1000070 |
CAU_T1_2 | 1000030 |
CAU_T1_3 | 999912 |
CYM_T0_1 | 1000234 |
CYM_T0_2 | 1000111 |
CYM_T0_3 | 1000043 |
CYM_T1_1 | 1000078 |
CYM_T1_2 | 999876 |
CYM_T1_3 | 999852 |
Let us check first that with these reescaled samples (where, due to rounding, the proportions are not exactly the original ones but very close to them), the behaviour of the distances is the same as the original one:
Times | Average BC-dist. |
|---|---|
T0 | 0.4124 |
T1 | 0.4097 |
T2C | 0.4493 |
T2HW | 0.3766 |
Global | 0.4189 |
No change.
We shall use the following function:
#' II indexes of the CAU grouped samples
#' The first one is the "base" time
#' Example: CAU T0+T1 and CYM T0+T1 vs CAU T0 and CYM T0
#' II=c(1,2)
#' Example: CAU_T0 vs CYM_T0
#' II=c(1)
Simulation=function(II,distancia){
#CAU
XX1=c()
for (i in 1:length(II)){
XX1=c(XX1,rep(OTUs,DF.agrup[II[i],]))
}
XX1=factor(XX1,levels=OTUs)
ind1=1:length(XX1)
#CYM
XX2=c()
for (i in 1:length(II)){
XX2=c(XX2,rep(OTUs,DF.agrup[4+II[i],]))
}
XX2=factor(XX2,levels=OTUs)
ind2=1:length(XX2)
# sizes of base samples
SS=Sizes[c(3*(II[1]-1)+1:3,3*((4+II[1])-1)+1:3)]
ind1.1=sample(ind1,SS[1],replace=FALSE)
ind1.2=sample(ind1[-ind1.1],SS[2],replace=FALSE)
ind1.3=sample(ind1[-c(ind1.1,ind1.2)],SS[3],replace=FALSE)
ind2.1=sample(ind2,SS[4],replace=FALSE)
ind2.2=sample(ind2[-ind2.1],SS[5],replace=FALSE)
ind2.3=sample(ind2[-c(ind2.1,ind1.2)],SS[6],replace=FALSE)
Y1.1=prop.table(table(factor(XX1[ind1.1],levels=OTUs)))
Y1.2=prop.table(table(factor(XX1[ind1.2],levels=OTUs)))
Y1.3=prop.table(table(factor(XX1[ind1.3],levels=OTUs)))
Y2.1=prop.table(table(factor(XX2[ind2.1],levels=OTUs)))
Y2.2=prop.table(table(factor(XX2[ind2.2],levels=OTUs)))
Y2.3=prop.table(table(factor(XX2[ind2.3],levels=OTUs)))
YY=rbind(Y1.1,Y1.2,Y1.3,Y2.1,Y2.2,Y2.3)
BC=c()
for (i in 1:3){
for (j in 4:6){
BC=c(BC,vegan::vegdist(YY[c(i,j),], method=distancia))
}
}
median(BC)
}
For reproducibility, we fix a random seed
#initial_seed=as.integer(Sys.time())
#print(initial_seed)
## [1] 1725986963
#initial_seed%%10000
## [1] 6963
set.seed(6963)
Let us perform the desired contrast for T0 vs T1 and the BC-distance
Sim.T0T1.T0.BC=replicate(1000,Simulation(c(1,2),"bray"))
Sim.T0.BC=replicate(1000,Simulation(c(1),"bray"))
saveRDS(Sim.T0T1.T0.BC, file="Sim.T0T1.T0.BC.RData")
saveRDS(Sim.T0.BC, file="Sim.T0.BC.RData")
The median BC-distances for the random T0+T1 samples go from 0.3877 to 0.3895, while the median BC-distances for the T0 samples go from 0.4057 to 0.407. So, all median BC-distances for T0 are larger than any median distance for T0+T1. Moreover, recall that the median BC-distance between the (reescaled) CAU_T0 and CYM_T0 samples was 0.4124, larger than the largest median BC-distance for the random T0+T1 samples.
For robustness, let us perform the contrast with T1 as base samples
Sim.T0T1.T1.BC=replicate(1000,Simulation(c(2,1),"bray"))
Sim.T1.BC=replicate(1000,Simulation(c(2),"bray"))
saveRDS(Sim.T0T1.T1.BC, file="Sim.T0T1.T1.BC.RData")
saveRDS(Sim.T1.BC, file="Sim.T1.BC.RData")
In this case, the median BC-distances for the T0+T1 samples go from 0.3874 to 0.3894, while the median BC-distances for the T1 samples go from 0.3792 to 0.3807. The median BC-distance between the (reescaled) CAU_T1 and CYM_T1 samples was 0.4097.
Now T1 vs T2C, taking T1 as base samples (for the sake of completeness, we repeat the simulation for T1).
Sim.T1T2C.T1.BC=replicate(1000,Simulation(c(2,3),"bray"))
Sim.T1.BC.2=replicate(1000,Simulation(c(2),"bray"))
saveRDS(Sim.T1T2C.T1.BC, file="Sim.T1T2C.T1.BC.RData")
saveRDS(Sim.T1.BC.2, file="Sim.T1.BC.2.RData")
The median BC-distances for the T1+T2C samples go from 0.3971 to 0.3994, while the median BC-distances for the T1 samples go from 0.3792 to 0.3806. The median BC-distance between the (reescaled) CAU_T1 and CYM_T1 samples was 0.4097.
And T2C vs T2HW, with T2C as base samples.
Sim.T2CT2HW.T2C.BC=replicate(1000,Simulation(c(3,4),"bray"))
Sim.T2C.BC=replicate(1000,Simulation(c(3),"bray"))
saveRDS(Sim.T2CT2HW.T2C.BC,file="Sim.T2CT2HW.T2C.BC.RData")
saveRDS(Sim.T2C.BC, file="Sim.T2C.BC.RData")
The median BC-distances for the T2C+T2HW samples go from 0.385 to 0.3869, while the median BC-distances for the T2C samples go from 0.4274 to 0.4288. The median BC-distance between the (reescaled) CAU_T2C and CYM_T2C samples was 0.4493
Finally, the global samples:
Sim.Global.BC=replicate(1000,Simulation(c(1,2,3,4),"bray"))
Sim.T0.BC=replicate(1000,Simulation(c(1),"bray"))
saveRDS(Sim.Global.BC, file="Sim.Global.BC.RData")
saveRDS(Sim.T0.BC, file="Sim.T0.BC.RData")
The median BC-distances bewteen CAU and CYM for the Global samples go from 0.3805 to 0.3828, while the median BC-distances for the T0 samples go from 0.4057 to 0.407. The median BC-distance between the (reescaled) CAU_T0 and CYM_T0 samples was 0.4124
For each pair of sample types considered, we proceed as follows:
We load TablaOTUs.xlsx and retain only the relevant “global” samples (all, only CAU, or only CYM).
We remove OTUs with 2 or fewer reads across the selected samples.
We remove OTUs that appear in only one of the selected samples, in order to apply Bayesian-multiplicative zero imputation.
We perform Bayesian-multiplicative zero imputation on the selected samples.
We retain only the samples corresponding to the time points to be compared.
We assess whether and how well the two types of samples are clearly separated in a hierarchical clustering.
We study how well the two types of samples separate when only a random group of OTUs of fixed size is considered, for different sizes. This provides a reference against which we evaluate the significance of the separation observed with specific groups of OTUs.
To evaluate how a group of taxa separates two groups of samples CAUx and CYMx, we use the index
\[
\frac{\text{distance between CAU${}_x$ and CYM${}_x$}}{\sqrt{(\text{dist. within CAU${}_x$})^2 + (\text{dist. within CYM${}_x$})^2}}
\]
where the distances are calculated as the heights in the hierarchical tree obtained by clustering the samples with the selected taxa.
In each one of these simulations, we fix the random seed as the last four digits of Sys.time() at the moment of execution.
We search for groups of OTUs with significantly different abundances in the two sample types by applying several methods: significance according to ALDeX (adjusted p-values from the Kruskal–Wallis test); relevant log-ratios; bacterial signatures identified with coda_glmnet; and significance according to simper.
When the different methods identify groups of OTUs that consistently separate the two sample types, we record them in a table Significant, and finally we export this table as a csv file. Its name is “OTU_Significativos_” followed by the pair of sample types.
tax_otu=data.frame(read_excel("TablaOTUs.xlsx"))
Samples=tax_otu$ID
n.OTUs.original=dim(tax_otu)[2]-2
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
DF.0=tax_otu[,-c(1,2)]
taxa.original=taxa
OTUs=paste("OTU",1:n.OTUs.original,sep="")
colnames(DF.0)=OTUs
DF.0.total=DF.0
colnames(DF.0.total)=OTUs
rownames(DF.0.total)=Samples
rm(tax_otu)
Type=as.factor(rep(c("CAU","CYM"),each=12))
colors=c("green","blue")[Type]
#' Green: CAU
#' Blue: CYM
The OTUs that appear with 2 or fewer hits in the total sample represent 15.8% of the total.
Miseria=as.vector(which(colSums(DF.0)<=2))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
# DF.0.hits: para cada OTU y cada muestra, 1 si OTU in Muestra y 0 si no
hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))
OTUs that appear in only one sample but represent more than 0.05% of that sample:
Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL
Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
kbl() %>%
kable_styling()
| Sample | OTUs | Reads | Proportions |
|---|---|---|---|
| CAU_T0_2 | OTU966 | 55 | 0.0012183 |
We remove the OTUs that appear in only one samples and we perform Bayesian-multiplicative zero imputation.
DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]
row.names(DF.0)=Samples
DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
row.names(DF)=Samples
rm(DF.0.hits)
rm(Miseria)
From 3088 initial taxa, we have reduced to 2542.
HC=ClusterHC(DF,Grups=Type,barplot=FALSE,colores=colors)
separacio.global=Sep(HC)
The two sample types are perfectly separated from the start. The separation between groups is 2.47.
We observe below that the two sample types are so different that almost any subset of OTUs of reasonable size classifies them correctly. This will serve as a reference to compare how much improvement we gain with the “good” sets identified later.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711395644
initial_seed %% 10000
# [1] 5644
set.seed(5644)
Experimento=function(i)
{
ii=sample(dim(DF)[2],i,rep=FALSE)
DFtemp=DF[,ii]
CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=250
F.SS=c()
for (i in 10:round((dim(DF)[2])/2))
{
print(i)
EE=replicate(n,Experimento(i))
EE.2=EE[2,which(EE[1,]==0)]
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
F.SS=rbind(F.SS, c(i,
length(EE.2)/250,
mean(EE.2),
quantile(XX,0.025),
quantile(XX,0.975)
))
}
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CAUvsCYM.RData")
F.SS=readRDS("Fraccio.Separadors.CAUvsCYM.RData")
The first plot shows, for each \(n\) up to half the total number of OTUs, the proportion of samples of size \(n\) that perfectly classify CAU and CYM (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We have used smoothing splines to smooth the curves.
F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)
plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",xlim=c(0,1250),xaxp=c(0,1250,25),yaxp=c(0,1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
The second plot shows, for each \(n\) up to half the total number of OTUs, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CAU and CYM (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve). This plot will serve as a reference later.
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,1250),xaxp=c(0,1250,25),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
abline(h=separacio.global,col="green")
`
We always use the aldexCesc.clr version of the aldex.clr function (from the script funcionsCODAMETACIRCLE.R) to generate the simulated samples.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711411362
initial_seed %% 10000
# [1] 1362
set.seed(1362)
repl.kw.Cescv2.CaCy=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex.kw.Cescv2.CaCy=aldex.kw(repl.kw.Cescv2.CaCy, verbose=FALSE)
#'
saveRDS(aldex.kw.Cescv2.CaCy, file="aldex_kw_Cescv2_CaCy.RData")
p.valores_kw=readRDS("aldex_kw_Cescv2_CaCy.RData")
Adjusted Kruskal–Wallis p-value < 0.05
sep.kw.005=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.05,1)
kw.005=which(p.valores_kw[,2]<0.05)
There are 804 taxa with adjusted Kruskal–Wallis p-value < 0.05.
The separation between groups is 2.48.
Adjusted Kruskal–Wallis p-value < 0.01
sep.kw.001=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.01,1)
kw.001=which(p.valores_kw[,2]<0.01)
There are 605 taxa with adjusted Kruskal–Wallis p-value < 0.01.
The separation between groups is 2.35.
Adjusted Kruskal–Wallis p-value < 0.001
sep.kw.0001=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.001,1)
kw.0001=which(p.valores_kw[,2]<0.001)
There are 408 taxa with adjusted Kruskal–Wallis p-value < 0.001.
The separation between groups is 2.89.
Indicador=function(x,k=n.OTUs.original){
xx=rep(0,k)
y=as.numeric(gsub("\\D+", "", OTUs[x]))
xx[y]=1
return(xx)
}
Significant=data.frame(
OTUs=paste("OTU",1:n.OTUs.original,sep=""),
taxa=taxa.original,
Aldex.kw.0.05=Indicador(kw.005),
Aldex.kw.0.01=Indicador(kw.001),
Aldex.kw.0.001=Indicador(kw.0001)
)
We plot the assock values (see the main text) and obtain its maximum.
LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CAUCYM.RData")
LRS=readRDS("LRS_CAUCYM.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
saveRDS(assoc, file="assoc_CAUCYM.RData")
assoc=readRDS("assoc_CAUCYM.RData")
plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most logratio-relevant taxa."
,pch=20,cex=0.5)
m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
DF.Imp=DF[,impLR]
HC=ClusterHC(DF.Imp,dendrograma=FALSE,barplot=FALSE, Grups=Type)
LR.max=c(length(impLR),Sep(HC))
The maximum is attained at the set of 103 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 6.58.
Significant=cbind(Significant,
Log_ratio.max=Indicador(impLR))
coda_glmnet)We always apply the coda_glmnet function to the frequency matrix with zeros already imputed.
coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
saveRDS(coda.glmnet, file="coda.glmnet.CAUCYM.RData")
coda.glmnet=readRDS("coda.glmnet.CAUCYM.RData")
coda.glmnet$`signature plot`
impGLMNET=coda.glmnet$taxa.num
DF.Imp=DF[,impGLMNET]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
SB.glmnet=c(length(coda.glmnet$taxa.num),Sep(HC))
There are 118 significant taxa.
The separation between groups is 4.55.
Significant=cbind(Significant,
glmnet_sign.CAU=Indicador(coda.glmnet$taxa.num[coda.glmnet$`log-contrast coefficients`>0]),
glmnet_sign.CYM=Indicador(coda.glmnet$taxa.num[coda.glmnet$`log-contrast coefficients`<0]))
We always use simper from vegan on the (original) proportion matrix.
DF.prop=DF.0.total/rowSums(DF.0.total)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP.rel.prop=SMP.prop$CAU_CYM$ord[which(SMP.prop$CAU_CYM$cusum<=0.7)]
OTUs.simper.rel.prop=sort(attr(SMP.prop$CAU_CYM$cusum[which(SMP.prop$CAU_CYM$cusum<=0.7)],"names"))
DF.Imp=DF[,OTUs.simper.rel.prop]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
simper.rel.props=c(length(indexos_SMP.rel.prop),Sep(HC))
There are 93 significant taxa. They perfectly separate the two sample types. The separation between groups is 2.88.
Indicador.Gl=function(x,k=n.OTUs.original){
xx=rep(0,k)
xx[x]=1
return(xx)
}
Significant=cbind(Significant,
simper.props=Indicador.Gl(indexos_SMP.rel.prop))
Significant=cbind(Significant,
t(DF.prop))
write.csv2(Significant,"OTU_Significativos_CAUvsCYM_def.csv",row.names=FALSE)
Resultados=rbind(
sep.kw.0001,sep.kw.005,sep.kw.001,
LR.max,SB.glmnet,
simper.rel.props
)
row.names(Resultados)=c("Aldex.0.001","Aldex.0.05","Aldex.0.01",
"LR","glmnet", "simper")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xaxp=c(0,1000,40),yaxp=c(0,8.5,17),ylim=c(0,8.5),xlim=c(0,1000))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.1,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)
Finally, we intersect the OTUs identified as significant according to LR and coda_glmnet.
Significant.capat=Significant[,c(1,2,6,7,8)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
There are 41 such taxa. They are
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| OTU22 | OTU22 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfatitalea.s__uncultured_Desulfobacteraceae |
| OTU30 | OTU30 | d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__PHOS.HE36.g__PHOS.HE36.s__uncultured_bacterium |
| OTU32 | OTU32 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.guncultured. |
| OTU35 | OTU35 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__LCP.80.s__uncultured_delta |
| OTU38 | OTU38 | Unassigned.__.....__ |
| OTU39 | OTU39 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__Lentimicrobiaceae.g__Lentimicrobiaceae.s__uncultured_Bacteroidetes |
| OTU42 | OTU42 | d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Sandaracinaceae.guncultured. |
| OTU45 | OTU45 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae.. |
| OTU46 | OTU46 | d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__Ignavibacteriaceae.gIgnavibacterium. |
| OTU70 | OTU70 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.oDesulfobacterales... |
| OTU109 | OTU109 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gLCP.80. |
| OTU133 | OTU133 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.gDesulfatiglans. |
| OTU141 | OTU141 | d__Bacteria.p__Bdellovibrionota.c__Oligoflexia.o__0319.6G20.f__0319.6G20.g0319.6G20. |
| OTU213 | OTU213 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gDesulfatitalea. |
| OTU218 | OTU218 | d__Bacteria.p__Spirochaetota.c__Leptospirae.o__Leptospirales.f__Leptospiraceae.g__RBG.16.49.21.s__uncultured_organism |
| OTU223 | OTU223 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__ST.12K33.g__ST.12K33.s__uncultured_Cytophagales |
| OTU229 | OTU229 | d__Bacteria.p__Spirochaetota.c__Leptospirae.o__Leptospirales.f__Leptospiraceae.g__RBG.16.49.21.s__uncultured_bacterium |
| OTU230 | OTU230 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Cytophagales.f__Cyclobacteriaceae.g__uncultured.s__uncultured_Bacteroidetes |
| OTU253 | OTU253 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.g__Desulfatiglans.s__uncultured_delta |
| OTU255 | OTU255 | d__Bacteria.p__Spirochaetota.c__V2072.189E03.o__V2072.189E03.f__V2072.189E03.g__V2072.189E03.s__uncultured_organism |
| OTU256 | OTU256 | d__Bacteria.p__Fermentibacterota.c__Fermentibacteria.o__Fermentibacterales.f__Fermentibacteraceae.g__Fermentibacteraceae.s__uncultured_bacterium |
| OTU269 | OTU269 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.gCandidatus_Thiobios. |
| OTU451 | OTU451 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gActibacter. |
| OTU498 | OTU498 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.g__Desulfatiglans.s__uncultured_organism |
| OTU527 | OTU527 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfobacteraceae.g__Desulfotignum.s__uncultured_bacterium |
| OTU761 | OTU761 | d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Microtrichales.f__Microtrichaceae.gSva0996_marine_group. |
| OTU811 | OTU811 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfosarcina.s__uncultured_bacterium |
| OTU881 | OTU881 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiotrichales.f__Thiotrichaceae.g__uncultured.s__uncultured_gamma |
| OTU890 | OTU890 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Robiginitalea.s__uncultured_Bacteroidetes |
| OTU1107 | OTU1107 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Sva0081_sediment_group.s__metagenome |
| OTU1329 | OTU1329 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfococcaceae.gDesulfonema. |
| OTU1390 | OTU1390 | d__Bacteria.p__Gemmatimonadota.c__PAUC43f_marine_benthic_group.o__PAUC43f_marine_benthic_group.f__PAUC43f_marine_benthic_group.g__PAUC43f_marine_benthic_group.s__saltmarsh_clone |
| OTU1419 | OTU1419 | d__Bacteria.p__NB1.j.c__NB1.j.o__NB1.j.f__NB1.j.g__NB1.j.s__uncultured_bacterium |
| OTU1443 | OTU1443 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__B2M28.f__B2M28.g__B2M28.s__uncultured_proteobacterium |
| OTU1461 | OTU1461 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Robiginitalea.s__Robiginitalea_myxolifaciens |
| OTU1532 | OTU1532 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.g__Candidatus_Thiobios.s__uncultured_organism |
| OTU1594 | OTU1594 | d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__PHOS.HE36.gPHOS.HE36. |
| OTU1701 | OTU1701 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Methyloligellaceae.gMethyloceanibacter. |
| OTU1839 | OTU1839 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Ectothiorhodospirales.f__Thioalkalispiraceae.g__Thioalkalispira.Sulfurivermis.s__uncultured_Thioalkalispiraceae |
| OTU2338 | OTU2338 | d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Cyanobacteriales.f__Xenococcaceae.gXenococcus_PCC.7305. |
| OTU2371 | OTU2371 | d__Bacteria.p__Firmicutes.c__Clostridia.o__Lachnospirales.f__Lachnospiraceae.gRoseburia. |
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
The separation between groups is 7.86.
We update the plot with the separations:
intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c("Aldex.0.001","Aldex.0.05","Aldex.0.01",
"LR","glmnet", "simper","intersection")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xaxp=c(0,1000,40),yaxp=c(0,8.5,17),ylim=c(0,8.5),xlim=c(0,1000))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.1,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)
Resultados %>%
kbl() %>%
kable_styling()
| Size | Separation | |
|---|---|---|
| Aldex.0.001 | 408 | 2.889614 |
| Aldex.0.05 | 804 | 2.481799 |
| Aldex.0.01 | 605 | 2.354071 |
| LR | 103 | 6.579296 |
| glmnet | 118 | 4.550477 |
| simper | 93 | 2.881239 |
| intersection | 41 | 7.863271 |
tax_otu=data.frame(read_excel("TablaOTUs.xlsx"))
Samples=tax_otu$ID
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")
DF.0=tax_otu[c(1:3,13:15),-c(1,2)]
row.names(DF.0)=Samples[c(1:3,13:15)]
colnames(DF.0)=OTUs
taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]
Miseria=as.vector(which(colSums(DF.0)<=2))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))
Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL
DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]
row.names(DF.0)=Samples[c(1:3,13:15)]
DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
row.names(DF)=Samples[c(1:3,13:15)]
Type=as.factor(c(rep("CAU_T0", 3),rep("CYM_T0", 3)))
colors=c("green","blue")[Type]
After removing OTUs with 2 or fewer reads across the T0 samples and those appearing in only one of these samples, 1472 taxa remain.
The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:
Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
kbl() %>%
kable_styling()
| Sample | OTUs | Reads | Proportions |
|---|---|---|---|
| CAU_T0_2 | OTU966 | 55 | 0.0012208 |
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)
HC=ClusterHC(DF,Grups=Type,barplot=FALSE,colores=colors)
separacio.global=Sep(HC)
The two sample types are perfectly separated from the start. The separation between groups is 2.43.
We examine the proportion of OTU subsets of reasonable size that already classify them correctly.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717567904
initial_seed %% 10000
# [1] 7904
set.seed(7904)
Experimento=function(i)
{
ii=sample(dim(DF)[2],i,rep=FALSE)
DFtemp=DF[,ii]
CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=250
F.SS=c()
for (i in 10:750)
{
print(i)
EE=replicate(n,Experimento(i))
EE.2=EE[2,which(EE[1,]==0)]
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
F.SS=rbind(F.SS, c(i,
length(EE.2)/250,
mean(EE.2),
quantile(XX,0.025),
quantile(XX,0.975)
))
}
colnames(F.SS)=c("n","proporción","sep. media","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.T0.RData")
F.SS=readRDS("Fraccio.Separadors.T0.RData")
The first plot shows, for each \(n\) up to half the total number of OTUs, the proportion of samples of size \(n\) that perfectly classify CAU_T0 and CYM_T0 (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.
F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)
plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",xlim=c(0,750),xaxp=c(0,750,15),yaxp=c(0,1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
The second plot shows, for each \(n\) up to half the total number of OTUs, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CAU_T0 and CYM_T0 (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve).
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,750),xaxp=c(0,750,15),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
abline(h=separacio.global,col="green")
`
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717574148
initial_seed %% 10000
# [1] 4148
set.seed(4148)
repl.kw.Cescv2.T0=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex.kw.Cescv2.T0=aldex.kw(repl.kw.Cescv2.T0, verbose=FALSE)
#'
saveRDS(aldex.kw.Cescv2.T0, file="aldex_kw_Cescv2.T0.RData")
Taxa with statistically significant differences:
p.valores_kw=readRDS("aldex_kw_Cescv2.T0.RData")
length(p.valores_kw[p.valores_kw[,2]<0.05,2]) #K-W adjusted p-value < 0.05
## [1] 0
length(p.valores_kw[p.valores_kw[,1]<0.05,2]) #p-value < 0.05
## [1] 121
length(p.valores_kw[p.valores_kw[,1]<0.01,4]) #p-value < 0.01
## [1] 0
There is no OTU with an adjusted Kruskal–Wallis test p-value < 0.05. There are 121 with an unadjusted p-value < 0.05 (and none < 0.01). They classify the samples correctly:
sep.kw.005=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.05,1,BP=FALSE)
kw.005=which(p.valores_kw[,2]<0.001)
The separation between groups is 3.43.
Significant=data.frame(
OTUs=paste("OTU",1:n.OTUs.original,sep=""),
taxa=taxa.original,
Aldex.kw.0.05=Indicador(kw.005)
)
LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_T0.RData")
LRS=readRDS("LRS_T0.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
saveRDS(assoc, file="assoc_T0.RData")
assoc=readRDS("assoc_T0.RData")
plot(5:200,assoc[1:196],type="b",xlab="m",ylab="Bondad mediana de los m más relevantes"
,pch=20,cex=0.5)
m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
DF.Imp=DF[,impLR]
HC=ClusterHC(DF.Imp,dendrograma=FALSE,barplot=FALSE, Grups=Type)
LR.max=c(length(impLR),Sep(HC))
The maximum is attained at the set of 549 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 2.83.
Note In the plot we can observe a very pronounced local maximum at 46. Let us examine it:
m0=4+which.max(assoc[1:100])
impLR.2=sort(as.numeric(LRS$`order of importance`[1:m0]))
DF.Imp.0=DF[,impLR.2]
HC=ClusterHC(DF.Imp.0,dendrograma=TRUE,barplot=FALSE, Grups=Type)
LR.max.2=c(length(impLR.2),Sep(HC))
They also perfectly separate the two sample types. The separation between groups is 4.66. Fewer taxa, but better separation.
Significant=cbind(Significant,
Log_ratio.max=Indicador(impLR),
Log_ratio.max2=Indicador(impLR.2))
coda_glmnet)coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
saveRDS(coda.glmnet, file="coda.glmnet.T0.RData")
coda.glmnet=readRDS("coda.glmnet.T0.RData")
coda.glmnet$`signature plot`
impGLMNET=coda.glmnet$taxa.num
DF.Imp=DF[,impGLMNET]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.no0=c(length(coda.glmnet$taxa.num),Sep(HC))
There are 350 significant taxa.
The separation between groups is 2.78.
Significant=cbind(Significant,
glmnet_sign.no0.CAU=Indicador(coda.glmnet$taxa.num[coda.glmnet$`log-contrast coefficients`>0]),
glmnet_sign.no0.CYM=Indicador(coda.glmnet$taxa.num[coda.glmnet$`log-contrast coefficients`<0]))
Out of curiosity, we use 3 replicates to see what happens.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717687628
initial_seed %% 10000
# [1] 7628
M=3
set.seed(7628)
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(
repls$CAU_T0_1,repls$CAU_T0_2,repls$CAU_T0_3,
repls$CYM_T0_1,repls$CYM_T0_2,repls$CYM_T0_3
))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(
paste("CAU_T0_1",1:M,sep="_"),
paste("CAU_T0_2",1:M,sep="_"),
paste("CAU_T0_3",1:M,sep="_"),
paste("CYM_T0_1",1:M,sep="_"),
paste("CYM_T0_2",1:M,sep="_"),
paste("CYM_T0_3",1:M,sep="_")
)
mostres_ext=rbind(repls,easyCODA::CLR(DF)$LR)
conds_ext=as.factor(c(rep("CAU", 3*M),rep("CYM", 3*M),rep("CAU", 3),rep("CYM", 3)))
colors_ext=c("green","blue")[conds_ext]
rm(repls)
LogRat_mostres_ext_T0=logratios_matrix_clr(mostres_ext)
saveRDS(LogRat_mostres_ext_T0, file="LogRat_mostres_ext_T0.RData")
LogRat_mostres_ext_T0=readRDS("LogRat_mostres_ext_T0.RData")
LogRat_mostres_ext_T0=list(LogRat_mostres_ext_T0[[1]],LogRat_mostres_ext_T0[[2]])
coda.glmnet_ext=coda_glmnet_lr_bin(LogRat_mostres_ext_T0,conds_ext,taxa,showPlots = FALSE)
coda.glmnet_ext$`signature plot`
impGLMNET_ext=coda.glmnet_ext$taxa.num
DF.Imp=DF[,impGLMNET_ext]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext=c(length(impGLMNET_ext),Sep(HC))
There are 104 significant taxa.
The separation between groups is 4.42.
Significant=cbind(Significant,
glmnet_sign.ext.CAU=Indicador(coda.glmnet_ext$taxa.num[coda.glmnet_ext$`log-contrast coefficients`>0]),
glmnet_sign.ext.CYM=Indicador(coda.glmnet_ext$taxa.num[coda.glmnet_ext$`log-contrast coefficients`<0]))
DF.prop=DF.0.total[c(1:3,13:15),]/rowSums(DF.0.total[c(1:3,13:15),])
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP.07=SMP.prop$CAU_T2C_CAU_T2HW$ord[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.7)]
indexos_SMP.05=SMP.prop$CAU_T2C_CAU_T2HW$ord[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.5)]
length(indexos_SMP.07)
## [1] 0
length(indexos_SMP.05)
## [1] 0
We find no significant taxon.
write.csv2(Significant,"OTU_Significativos_CAUT0vsCYMT0.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
LR.max.2,
sep.kw.005,
glmnet.no0,
glmnet.ext)
row.names(Resultados)=c(
"LR.max",
"LR.max.local",
"Aldex",
"glmnet",
"glmnet.ext"
)
colnames(Resultados)=c("Size","Separation")
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,750),xaxp=c(0,750,15),ylim=c(0,7),yaxp=c(0,7,28))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
abline(h=separacio.global,col="green")
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.1,col="blue",labels=row.names(Resultados),cex=0.75)
text(20,separacio.global+0.01,col="green",labels="Global",cex=0.75)
Finally, we intersect the OTUs identified as significant according to LR and coda_glmnet.
Significant.capat=Significant[,c(1,2,4,6,7)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
There are 312 taxa. They are:
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| 1 | OTU1 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiotrichales.f__Thiotrichaceae.guncultured. |
| 2 | OTU2 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Actibacter.s__uncultured_Bacteroidetes |
| 3 | OTU3 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae.g__uncultured.s__uncultured_delta |
| 4 | OTU4 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Robiginitalea.s__bacterium_AMSU |
| 7 | OTU7 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.... |
| 8 | OTU8 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.g__uncultured.s__uncultured_organism |
| 9 | OTU9 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gSva0081_sediment_group. |
| 10 | OTU10 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Cytophagales.f__Cyclobacteriaceae.guncultured. |
| 12 | OTU12 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Bacteroidetes_BD2.2.gBacteroidetes_BD2.2. |
| 13 | OTU13 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae.guncultured. |
| 15 | OTU15 | d__Bacteria.p__Firmicutes.c__Clostridia.o__Clostridia.f__Hungateiclostridiaceae.g__uncultured.s__uncultured_bacterium |
| 16 | OTU16 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.. |
| 18 | OTU18 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiotrichales.f__Thiotrichaceae.g__uncultured.s__uncultured_sediment |
| 19 | OTU19 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.guncultured. |
| 20 | OTU20 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Robiginitalea.s__uncultured_bacterium |
| 22 | OTU22 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfatitalea.s__uncultured_Desulfobacteraceae |
| 23 | OTU23 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__Lentimicrobiaceae.gLentimicrobiaceae. |
| 24 | OTU24 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.oBacteroidales... |
| 25 | OTU25 | d__Archaea.p__Asgardarchaeota.c__Heimdallarchaeia.o__Heimdallarchaeia.f__Heimdallarchaeia.g__Heimdallarchaeia.s__uncultured_archaeon |
| 26 | OTU26 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__SBR1031.f__SBR1031.g__SBR1031.s__uncultured_organism |
| 28 | OTU28 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.g__Candidatus_Thiobios.s__uncultured_sediment |
| 29 | OTU29 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Milano.WF1B.44.f__Milano.WF1B.44.g__Milano.WF1B.44.s__uncultured_gamma |
| 30 | OTU30 | d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__PHOS.HE36.g__PHOS.HE36.s__uncultured_bacterium |
| 31 | OTU31 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.oEctothiorhodospirales... |
| 32 | OTU32 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.guncultured. |
| 33 | OTU33 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidetes_VC2.1_Bac22.f__Bacteroidetes_VC2.1_Bac22.g__Bacteroidetes_VC2.1_Bac22.s__uncultured_Bacteroidetes |
| 35 | OTU35 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__LCP.80.s__uncultured_delta |
| 36 | OTU36 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Steroidobacterales.f__Woeseiaceae.gWoeseia. |
| 37 | OTU37 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__KI89A_clade.f__KI89A_clade.gKI89A_clade. |
| 38 | OTU38 | Unassigned.__.....__ |
| 39 | OTU39 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__Lentimicrobiaceae.g__Lentimicrobiaceae.s__uncultured_Bacteroidetes |
| 40 | OTU40 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.g__Pelolinea.s__uncultured_bacterium |
| 41 | OTU41 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__B2M28.f__B2M28.gB2M28. |
| 42 | OTU42 | d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Sandaracinaceae.guncultured. |
| 43 | OTU43 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__uncultured.f__uncultured.g__uncultured.s__uncultured_Thioalkalivibrio |
| 44 | OTU44 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gSEEP.SRB1. |
| 45 | OTU45 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae.. |
| 47 | OTU47 | d__Archaea.p__Nanoarchaeota.c__Nanoarchaeia.o__Woesearchaeales.f__SCGC_AAA011.D5.g__SCGC_AAA011.D5.s__uncultured_euryarchaeote |
| 49 | OTU49 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Sedimenticolaceae.g__Candidatus_Thiodiazotropha.s__Candidatus_Thiodiazotropha |
| 50 | OTU50 | d__Bacteria.p__Sva0485.c__Sva0485.o__Sva0485.f__Sva0485.gSva0485. |
| 52 | OTU52 | d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Actinomarinales.f__uncultured.guncultured. |
| 53 | OTU53 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__uncultured.f__uncultured.guncultured. |
| 54 | OTU54 | d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Microtrichales.f__Ilumatobacteraceae.gIlumatobacter. |
| 55 | OTU55 | d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Caldithrix.s__uncultured_bacterium |
| 56 | OTU56 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Arenicellales.f__Arenicellaceae.guncultured. |
| 57 | OTU57 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gEudoraea. |
| 60 | OTU60 | d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Calditrichaceae.s__uncultured_bacterium |
| 62 | OTU62 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodobacterales.f__Rhodobacteraceae.. |
| 64 | OTU64 | d__Eukaryota...... |
| 69 | OTU69 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Sedimenticolaceae.. |
| 70 | OTU70 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.oDesulfobacterales... |
| 71 | OTU71 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gRobiginitalea. |
| 72 | OTU72 | d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Calorithrix.s__uncultured_bacterium |
| 73 | OTU73 | d__Bacteria.p__Bacteroidota.c__Kryptonia.o__Kryptoniales.f__BSV26.gBSV26. |
| 77 | OTU77 | d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__uncultured.f__uncultured.g__uncultured.s__uncultured_Thermoplasmatales |
| 79 | OTU79 | d__Bacteria.p__Acidobacteriota.c__Subgroup_21.o__Subgroup_21.f__Subgroup_21.g__Subgroup_21.s__uncultured_bacterium |
| 80 | OTU80 | d__Bacteria...... |
| 81 | OTU81 | d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__uncultured_bacterium |
| 82 | OTU82 | d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Caldithrix.s__uncultured_delta |
| 83 | OTU83 | d__Bacteria.p__SAR324_clade.Marine_group_B..c__SAR324_clade.Marine_group_B..o__SAR324_clade.Marine_group_B..f__SAR324_clade.Marine_group_B..gSAR324_clade.Marine_group_B.. |
| 84 | OTU84 | d__Bacteria.p__Latescibacterota.c__Latescibacteria.o__Latescibacterales.f__Latescibacteraceae.g__Latescibacteraceae.s__uncultured_bacterium |
| 85 | OTU85 | d__Bacteria.pChloroflexi..... |
| 88 | OTU88 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.gSpirochaeta_2. |
| 94 | OTU94 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Spirochaeta.s__uncultured_bacterium |
| 95 | OTU95 | d__Bacteria.p__Cloacimonadota.c__Cloacimonadia.o__Cloacimonadales.f__MSBL8.g__MSBL8.s__uncultured_bacterium |
| 98 | OTU98 | d__Bacteria.p__Acetothermia.c__Acetothermiia.o__Acetothermiia.f__Acetothermiia.gAcetothermiia. |
| 102 | OTU102 | d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__Melioribacteraceae.g__IheB3.7.s__uncultured_Chlorobi |
| 104 | OTU104 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Hyphomicrobiaceae.. |
| 105 | OTU105 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.gHalochromatium. |
| 108 | OTU108 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.gGWE2.31.10. |
| 111 | OTU111 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Ardenticatenales.f__uncultured.guncultured. |
| 113 | OTU113 | d__Archaea.p__Altiarchaeota.c__Altiarchaeia.o__Altiarchaeales.f__Altiarchaeaceae.g__Candidatus_Altiarchaeum.s__uncultured_archaeon |
| 116 | OTU116 | d__Bacteria.p__Zixibacteria.c__Zixibacteria.o__Zixibacteria.f__Zixibacteria.g__Zixibacteria.s__uncultured_bacterium |
| 118 | OTU118 | d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__Marine_Benthic_Group_D_and_DHVEG.1.f__Marine_Benthic_Group_D_and_DHVEG.1.g__Marine_Benthic_Group_D_and_DHVEG.1.s__uncultured_archaeon |
| 119 | OTU119 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Marinilabiliaceae.g__uncultured.s__uncultured_Bacteroidetes |
| 120 | OTU120 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfobulbaceae.gDesulfobulbus. |
| 122 | OTU122 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Marinilabiliaceae.guncultured. |
| 123 | OTU123 | d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.gPir4_lineage. |
| 124 | OTU124 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__GWE2.31.10.s__uncultured_gamma |
| 130 | OTU130 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__SAR11_clade.f__Clade_III.gClade_III. |
| 132 | OTU132 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__BD7.8.f__BD7.8.gBD7.8. |
| 133 | OTU133 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.gDesulfatiglans. |
| 135 | OTU135 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__SAR11_clade.f__Clade_I.gClade_Ia. |
| 136 | OTU136 | d__Bacteria.p__Patescibacteria.c__Parcubacteria.o__Candidatus_Kaiserbacteria.f__Candidatus_Kaiserbacteria.g__Candidatus_Kaiserbacteria.s__uncultured_bacterium |
| 139 | OTU139 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Micavibrionales.f__Micavibrionaceae.g__uncultured.s__uncultured_bacterium |
| 140 | OTU140 | d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Cyanobacteriales.f__Cyanobacteriaceae.. |
| 144 | OTU144 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gAquibacter. |
| 146 | OTU146 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__GWE2.31.10.s__uncultured_bacterium |
| 147 | OTU147 | d__Bacteria.p__Nitrospirota.c__Thermodesulfovibrionia.o__uncultured.f__uncultured.g__uncultured.s__uncultured_Nitrospirae |
| 148 | OTU148 | d__Bacteria.p__Verrucomicrobiota.c__Kiritimatiellae.o__Kiritimatiellales.f__Kiritimatiellaceae.gR76.B128. |
| 149 | OTU149 | d__Bacteria.p__Acidobacteriota.c__Subgroup_18.o__Subgroup_18.f__Subgroup_18.gSubgroup_18. |
| 150 | OTU150 | d__Archaea...... |
| 152 | OTU152 | d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.o__Pedosphaerales.f__Pedosphaeraceae.g__DEV008.s__uncultured_bacterium |
| 153 | OTU153 | d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__uncultured.f__uncultured.g__uncultured.s__uncultured_archaeon |
| 155 | OTU155 | d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__Melioribacteraceae.g__IheB3.7.s__uncultured_sediment |
| 156 | OTU156 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Spirochaeta_2.s__uncultured_bacterium |
| 157 | OTU157 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__SEEP.SRB1.s__Olavius_ilvae |
| 158 | OTU158 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Sediminispirochaeta.s__uncultured_bacterium |
| 159 | OTU159 | d__Bacteria.pAcidobacteriota..... |
| 160 | OTU160 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__uncultured.s__uncultured_Alphaproteobacteria |
| 161 | OTU161 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Caldilineales.f__Caldilineaceae.guncultured. |
| 162 | OTU162 | d__Bacteria.p__Gemmatimonadota.c__PAUC43f_marine_benthic_group.o__PAUC43f_marine_benthic_group.f__PAUC43f_marine_benthic_group.g__PAUC43f_marine_benthic_group.s__uncultured_actinobacterium |
| 168 | OTU168 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiomicrospirales.f__Thiomicrospiraceae.gendosymbionts. |
| 169 | OTU169 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Gammaproteobacteria_Incertae_Sedis.f__Unknown_Family.guncultured. |
| 173 | OTU173 | d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Caldithrix.s__uncultured_Deferribacteres |
| 175 | OTU175 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Caulobacterales.f__Hyphomonadaceae.. |
| 181 | OTU181 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.... |
| 185 | OTU185 | d__Bacteria.p__Patescibacteria.c__WWE3.o__WWE3.f__WWE3.g__WWE3.s__uncultured_bacterium |
| 189 | OTU189 | d__Bacteria.p__Firmicutes.c__Clostridia.o__Lachnospirales.f__Defluviitaleaceae.g__Defluviitaleaceae_UCG.011.s__uncultured_bacterium |
| 190 | OTU190 | d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Sandaracinaceae.g__uncultured.s__bacterium_enrichment |
| 191 | OTU191 | d__Archaea.p__Nanoarchaeota.c__Nanoarchaeia.o__Woesearchaeales.f__Woesearchaeales.g__Woesearchaeales.s__uncultured_crenarchaeote |
| 192 | OTU192 | d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.gRhodopirellula. |
| 194 | OTU194 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Spirochaeta_2.s__Spirochaeta_isovalerica |
| 204 | OTU204 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__SBR1031.f__SBR1031.g__SBR1031.s__metagenome |
| 205 | OTU205 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.gSediminispirochaeta. |
| 208 | OTU208 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gLutibacter. |
| 211 | OTU211 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfosarcina.s__uncultured_marine |
| 213 | OTU213 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gDesulfatitalea. |
| 216 | OTU216 | d__Bacteria.p__Actinobacteriota.c__WCHB1.81.o__WCHB1.81.f__WCHB1.81.g__WCHB1.81.s__uncultured_bacterium |
| 217 | OTU217 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfobulbaceae.g__Desulfobulbus.s__uncultured_bacterium |
| 222 | OTU222 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Prolixibacteraceae.gSunxiuqinia. |
| 226 | OTU226 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__KZNMV.5.B42.f__KZNMV.5.B42.gKZNMV.5.B42. |
| 228 | OTU228 | d__Bacteria.p__Verrucomicrobiota.c__Lentisphaeria.o__Oligosphaerales.f__Lenti.02.g__Lenti.02.s__uncultured_bacterium |
| 230 | OTU230 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Cytophagales.f__Cyclobacteriaceae.g__uncultured.s__uncultured_Bacteroidetes |
| 231 | OTU231 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.. |
| 232 | OTU232 | d__Bacteria.p__Chloroflexi.c__KD4.96.o__KD4.96.f__KD4.96.gKD4.96. |
| 236 | OTU236 | d__Bacteria.p__Bacteroidota.c__Kryptonia.o__Kryptoniales.f__MSB.3C8.gMSB.3C8. |
| 237 | OTU237 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Gammaproteobacteria_Incertae_Sedis.f__Unknown_Family.g__uncultured.s__uncultured_marine |
| 239 | OTU239 | d__Bacteria.p__Zixibacteria.c__Zixibacteria.o__Zixibacteria.f__Zixibacteria.gZixibacteria. |
| 242 | OTU242 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Gammaproteobacteria_Incertae_Sedis.f__Unknown_Family.g__Unknown_Family.s__uncultured_bacterium |
| 249 | OTU249 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Sedimenticolaceae.guncultured. |
| 252 | OTU252 | d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.o__Verrucomicrobiales.f__DEV007.gDEV007. |
| 253 | OTU253 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.g__Desulfatiglans.s__uncultured_delta |
| 260 | OTU260 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__CCM11a.f__CCM11a.g__CCM11a.s__uncultured_organism |
| 266 | OTU266 | d__Bacteria.p__Patescibacteria.c__ABY1.o__Candidatus_Kerfeldbacteria.f__Candidatus_Kerfeldbacteria.g__Candidatus_Kerfeldbacteria.s__uncultured_bacterium |
| 267 | OTU267 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Caulobacterales.f__Hyphomonadaceae.guncultured. |
| 274 | OTU274 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.g__Thermomarinilinea.s__uncultured_bacterium |
| 276 | OTU276 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.... |
| 277 | OTU277 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Kiloniellales.f__Kiloniellaceae.gTistlia. |
| 284 | OTU284 | d__Bacteria.p__LCP.89.c__LCP.89.o__LCP.89.f__LCP.89.g__LCP.89.s__saltmarsh_clone |
| 292 | OTU292 | d__Archaea.p__Thermoplasmatota.c__Thermoplasmatota.o__Thermoplasmatota.f__Thermoplasmatota.g__uncultured.s__uncultured_archaeon |
| 293 | OTU293 | d__Archaea.p__Aenigmarchaeota.c__Deep_Sea_Euryarchaeotic_Group.DSEG..o__Deep_Sea_Euryarchaeotic_Group.DSEG..f__Deep_Sea_Euryarchaeotic_Group.DSEG..g__Deep_Sea_Euryarchaeotic_Group.DSEG..s__uncultured_archaeon |
| 300 | OTU300 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Sediminispirochaeta.s__uncultured_Spirochaetes |
| 304 | OTU304 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gMaritimimonas. |
| 305 | OTU305 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__uncultured.s__uncultured_organism |
| 307 | OTU307 | d__Archaea.p__Nanoarchaeota.c__Nanoarchaeia.oWoesearchaeales... |
| 314 | OTU314 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__UBA10353_marine_group.f__UBA10353_marine_group.g__UBA10353_marine_group.s__uncultured_bacterium |
| 318 | OTU318 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Hyphomicrobiaceae.g__Filomicrobium.s__uncultured_Alphaproteobacteria |
| 328 | OTU328 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfatirhabdium.s__uncultured_delta |
| 334 | OTU334 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__MSBL9.f__SG8.4.gSG8.4. |
| 337 | OTU337 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__SJA.15.f__SJA.15.g__SJA.15.s__uncultured_Chloroflexi |
| 340 | OTU340 | d__Bacteria.p__Patescibacteria.c__Gracilibacteria.o__Absconditabacteriales_.SR1..f_Absconditabacteriales.SR1..g_Absconditabacteriales.SR1..s__uncultured_bacterium |
| 341 | OTU341 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Crocinitomicaceae.g__Crocinitomix.s__uncultured_bacterium |
| 353 | OTU353 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Oceanospirillales.f__Nitrincolaceae.guncultured. |
| 354 | OTU354 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Spirochaeta_2.s__uncultured_Spirochaetes |
| 356 | OTU356 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__CCM11a.f__CCM11a.g__CCM11a.s__uncultured_bacterium |
| 358 | OTU358 | d__Bacteria.p__Elusimicrobiota.c__Lineage_IIb.o__Lineage_IIb.f__Lineage_IIb.g__Lineage_IIb.s__uncultured_bacterium |
| 368 | OTU368 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodobacterales.f__Rhodobacteraceae.gHIMB11. |
| 371 | OTU371 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Maritimimonas.s__uncultured_organism |
| 372 | OTU372 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Oceanospirillales.f__Pseudohongiellaceae.gPseudohongiella. |
| 381 | OTU381 | d__Bacteria.p__Patescibacteria.c__Gracilibacteria.o__Candidatus_Peregrinibacteria.f__Candidatus_Peregrinibacteria.g__Candidatus_Peregrinibacteria.s__uncultured_bacterium |
| 386 | OTU386 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Vibrionales.f__Vibrionaceae.gVibrio. |
| 389 | OTU389 | d__Bacteria.p__Bdellovibrionota.c__Bdellovibrionia.o__Bacteriovoracales.f__Bacteriovoracaceae.g__Peredibacter.s__uncultured_bacterium |
| 390 | OTU390 | d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__uncultured_organism |
| 392 | OTU392 | d__Archaea.p__Asgardarchaeota.c__Odinarchaeia.o__Odinarchaeia.f__Odinarchaeia.g__Odinarchaeia.s__uncultured_marine |
| 393 | OTU393 | d__Bacteria.p__Latescibacterota.c__Latescibacteria.o__Latescibacterales.f__Latescibacteraceae.g__Latescibacteraceae.s__uncultured_actinobacterium |
| 394 | OTU394 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Sva0081_sediment_group.s__uncultured_marine |
| 397 | OTU397 | d__Bacteria.p__NB1.j.c__NB1.j.o__NB1.j.f__NB1.j.gNB1.j. |
| 400 | OTU400 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__CCM11a.f__CCM11a.gCCM11a. |
| 401 | OTU401 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Marinilabiliaceae.g__Labilibacter.s__uncultured_Bacteroidetes |
| 402 | OTU402 | d__Bacteria.p__Acetothermia.c__Acetothermiia.o__Acetothermiia.f__Acetothermiia.g__Acetothermiia.s__uncultured_organism |
| 421 | OTU421 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Micavibrionales.f__uncultured.guncultured. |
| 424 | OTU424 | d__Bacteria.p__Gemmatimonadota.c__PAUC43f_marine_benthic_group.o__PAUC43f_marine_benthic_group.f__PAUC43f_marine_benthic_group.gPAUC43f_marine_benthic_group. |
| 426 | OTU426 | d__Bacteria.p__Verrucomicrobiota.c__Omnitrophia.o__Omnitrophales.f__Omnitrophaceae.g__Candidatus_Omnitrophus.s__uncultured_bacterium |
| 435 | OTU435 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfobacteraceae.gDesulfobacter. |
| 436 | OTU436 | d__Bacteria.p__Desulfobacterota.c__Desulfuromonadia.o__Bradymonadales.f__Bradymonadales.gBradymonadales. |
| 448 | OTU448 | d__Bacteria.p__Patescibacteria.c__ABY1.o__Candidatus_Falkowbacteria.f__Candidatus_Falkowbacteria.gCandidatus_Falkowbacteria. |
| 453 | OTU453 | d__Bacteria.p__Modulibacteria.c__Moduliflexia.o__Moduliflexales.f__Moduliflexaceae.gModuliflexaceae. |
| 463 | OTU463 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__RBG.13.54.9.f__RBG.13.54.9.g__RBG.13.54.9.s__uncultured_bacterium |
| 475 | OTU475 | d__Bacteria.p__Chloroflexi.c__Dehalococcoidia.o__GIF9.f__AB.539.J10.g__SCGC.AB.539.J10.s__uncultured_bacterium |
| 476 | OTU476 | d__Bacteria.p__Myxococcota.c__Polyangia.o__MidBa8.f__MidBa8.g__MidBa8.s__uncultured_delta |
| 480 | OTU480 | d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Calditrichaceae.s__uncultured_organism |
| 482 | OTU482 | d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Chloroplast.f__Chloroplast.g__Chloroplast.s__Chromerida_sp. |
| 483 | OTU483 | d__Bacteria.p__Bdellovibrionota.c__Oligoflexia.o__Oligoflexales.f__uncultured.g__uncultured.s__uncultured_organism |
| 490 | OTU490 | d__Bacteria.p__Bdellovibrionota.c__Bdellovibrionia.o__Bacteriovoracales.f__Bacteriovoracaceae.gPeredibacter. |
| 495 | OTU495 | d__Archaea.p__Micrarchaeota.c__Micrarchaeia.o__Micrarchaeales.f__Micrarchaeales.g__Micrarchaeales.s__uncultured_euryarchaeote |
| 501 | OTU501 | d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__uncultured_Bacteroidetes |
| 503 | OTU503 | d__Bacteria.p__Fibrobacterota.c__Chitinivibrionia.o__Chitinivibrionales.f__Chitinivibrionaceae.g__possible_genus_03.s__uncultured_bacterium |
| 508 | OTU508 | d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.o__Verrucomicrobiales.f__DEV007.g__DEV007.s__uncultured_organism |
| 509 | OTU509 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodospirillales.f__Rhodospirillaceae.g__Candidatus_Riegeria.s__uncultured_bacterium |
| 530 | OTU530 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiomicrospirales.f__Thioglobaceae.gSUP05_cluster. |
| 532 | OTU532 | d__Bacteria.p__Acetothermia.c__Acetothermiia.o__Acetothermiia.f__Acetothermiia.g__Acetothermiia.s__bacterium_enrichment |
| 537 | OTU537 | d__Bacteria.p__Fermentibacterota.c__Fermentibacteria.o__Fermentibacterales.f__Fermentibacteraceae.g__Fermentibacteraceae.s__uncultured_sediment |
| 539 | OTU539 | d__Bacteria.p__Patescibacteria.c__Parcubacteria.o__Candidatus_Kaiserbacteria.f__Candidatus_Kaiserbacteria.gCandidatus_Kaiserbacteria. |
| 541 | OTU541 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Ga0077536.f__Ga0077536.g__Ga0077536.s__uncultured_gamma |
| 547 | OTU547 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Rhizobiales_Incertae_Sedis.gAnderseniella. |
| 553 | OTU553 | d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Helicobacteraceae.. |
| 554 | OTU554 | d__Bacteria.p__Chloroflexi.c__Dehalococcoidia.o__GIF9.f__AB.539.J10.g__AB.539.J10.s__uncultured_bacterium |
| 565 | OTU565 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__Portibacter.s__uncultured_Bacteroidetes |
| 568 | OTU568 | d__Bacteria.p__Margulisbacteria.c__Margulisbacteria.o__Margulisbacteria.f__Margulisbacteria.g__Margulisbacteria.s__uncultured_bacterium |
| 570 | OTU570 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Thermoflexales.f__Thermoflexaceae.gThermoflexus. |
| 574 | OTU574 | d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.g__uncultured.s__uncultured_organism |
| 585 | OTU585 | d__Bacteria.p__Bacteroidota.c__Kryptonia.o__Kryptoniales.f__MSB.3C8.g__MSB.3C8.s__uncultured_bacterium |
| 618 | OTU618 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Ectothiorhodospirales.f__Ectothiorhodospiraceae.g__Thiogranum.s__uncultured_gamma |
| 620 | OTU620 | d__Bacteria.p__Firmicutes.c__Clostridia.o__Lachnospirales.f__Lachnospiraceae.g__uncultured.s__Clostridiales_bacterium |
| 622 | OTU622 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__uncultured.g__uncultured.s__uncultured_Bacteroidetes |
| 628 | OTU628 | d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Rs.M59_termite_group.g__Rs.M59_termite_group.s__uncultured_bacterium |
| 631 | OTU631 | d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Campylobacteraceae.g__Campylobacter.s__uncultured_Epsilonproteobacteria |
| 657 | OTU657 | d__Bacteria.p__Actinobacteriota.c__Coriobacteriia.o__OPB41.f__OPB41.g__OPB41.s__unidentified_marine |
| 663 | OTU663 | d__Bacteria.p__Acidobacteriota.c__Vicinamibacteria.o__Subgroup_9.f__Subgroup_9.g__Subgroup_9.s__uncultured_bacterium |
| 694 | OTU694 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__NS5_marine_group.s__uncultured_Flavobacterium |
| 700 | OTU700 | d__Bacteria.p__SAR324_clade.Marine_group_B..c__SAR324_clade.Marine_group_B..o__SAR324_clade.Marine_group_B..f__SAR324_clade.Marine_group_B..g__SAR324_clade.Marine_group_B..s__uncultured_organism |
| 706 | OTU706 | d__Bacteria.p__Myxococcota.c__Polyangia.o__UASB.TL25.f__UASB.TL25.gUASB.TL25. |
| 708 | OTU708 | d__Bacteria.p__Patescibacteria.c__Gracilibacteria.o__Gracilibacteria.f__Gracilibacteria.g__Gracilibacteria.s__uncultured_organism |
| 714 | OTU714 | d__Bacteria.p__Verrucomicrobiota.c__Omnitrophia.o__Omnitrophales.f__Omnitrophales.g__Omnitrophales.s__uncultured_bacterium |
| 729 | OTU729 | d__Bacteria.p__Patescibacteria.c__Parcubacteria.o__Candidatus_Kaiserbacteria.f__Candidatus_Kaiserbacteria.g__Candidatus_Kaiserbacteria.s__uncultured_organism |
| 742 | OTU742 | d__Bacteria.p__Planctomycetota.c__Pla4_lineage.o__Pla4_lineage.f__Pla4_lineage.g__Pla4_lineage.s__uncultured_bacterium |
| 750 | OTU750 | d__Bacteria.p__Desulfobacterota.c__Syntrophobacteria.o__Syntrophobacterales.f__uncultured.g__uncultured.s__uncultured_delta |
| 761 | OTU761 | d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Microtrichales.f__Microtrichaceae.gSva0996_marine_group. |
| 776 | OTU776 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodospirillales.f__Terasakiellaceae.guncultured. |
| 789 | OTU789 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae.g__Desulfopila.s__uncultured_bacterium |
| 794 | OTU794 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodobacterales.f__Rhodobacteraceae.gMarivita. |
| 803 | OTU803 | d__Archaea.p__Aenigmarchaeota.c__Aenigmarchaeia.o__Aenigmarchaeales.f__Aenigmarchaeales.gAenigmarchaeales. |
| 814 | OTU814 | d__Archaea.p__Aenigmarchaeota.c__Aenigmarchaeia.o__Aenigmarchaeales.f__Aenigmarchaeales.g__Aenigmarchaeales.s__uncultured_archaeon |
| 823 | OTU823 | d__Bacteria.p__WS2.c__WS2.o__WS2.f__WS2.g__WS2.s__uncultured_Firmicutes |
| 837 | OTU837 | d__Bacteria.p__Latescibacterota.c__Latescibacteria.o__Latescibacterales.f__Latescibacteraceae.g__Latescibacteraceae.s__bacterium_enrichment |
| 840 | OTU840 | d__Bacteria.p__Chloroflexi.c__Dehalococcoidia.o__SAR202_clade.f__SAR202_clade.gSAR202_clade. |
| 845 | OTU845 | d__Bacteria.p__Dependentiae.c__Babeliae.o__Babeliales.f__Vermiphilaceae.gVermiphilaceae. |
| 897 | OTU897 | d__Bacteria.p__Chloroflexi.c__Dehalococcoidia.o__GIF9.f__AB.539.J10.gAB.539.J10. |
| 905 | OTU905 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Granulosicoccales.f__Granulosicoccaceae.g__Granulosicoccus.s__uncultured_bacterium |
| 908 | OTU908 | d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.oOpitutales... |
| 933 | OTU933 | d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Sulfurimonadaceae.g__Sulfurimonas.s__bacterium_enrichment |
| 937 | OTU937 | d__Bacteria.p__Acidobacteriota.c__Acidobacteriae.... |
| 950 | OTU950 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__CH2b56.f__CH2b56.g__CH2b56.s__uncultured_bacterium |
| 954 | OTU954 | d__Bacteria.p__Planctomycetota.c__vadinHA49.o__vadinHA49.f__vadinHA49.g__vadinHA49.s__uncultured_organism |
| 967 | OTU967 | d__Bacteria.p__Planctomycetota.c__Pla3_lineage.o__Pla3_lineage.f__Pla3_lineage.g__Pla3_lineage.s__uncultured_planctomycete |
| 983 | OTU983 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfobulbaceae.g__uncultured.s__uncultured_gamma |
| 987 | OTU987 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfobacteraceae.g__uncultured.s__uncultured_organism |
| 988 | OTU988 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Sphingomonadales.f__Sphingomonadaceae.g__Erythrobacter.s__Erythrobacter_longus |
| 989 | OTU989 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__Lewinella.s__uncultured_organism |
| 991 | OTU991 | d__Bacteria.p__Acetothermia.c__Acetothermiia.o__Acetothermiia.f__Acetothermiia.g__Acetothermiia.s__uncultured_bacterium |
| 993 | OTU993 | d__Bacteria.p__Sumerlaeota.c__Sumerlaeia.o__Sumerlaeales.f__Sumerlaeaceae.g__Sumerlaea.s__uncultured_organism |
| 998 | OTU998 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Flavobacteriaceae.s__Aureicoccus_marinus |
| 1000 | OTU1000 | d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Synechococcales.f__Cyanobiaceae.gCyanobium_PCC.6307. |
| 1001 | OTU1001 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Paracaedibacterales.f__Paracaedibacteraceae.g__Candidatus_Captivus.s__uncultured_bacterium |
| 1003 | OTU1003 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__NS11.12_marine_group.g__NS11.12_marine_group.s__uncultured_marine |
| 1013 | OTU1013 | d__Archaea.p__Nanoarchaeota.c__Nanoarchaeia.o__Woesearchaeales.f__GW2011_GWC1_47_15.gGW2011_GWC1_47_15. |
| 1014 | OTU1014 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.oCellvibrionales... |
| 1041 | OTU1041 | d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.o__Verrucomicrobiales.f__Rubritaleaceae.g__uncultured.s__uncultured_bacterium |
| 1050 | OTU1050 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__uncultured.g__uncultured.s__uncultured_bacterium |
| 1057 | OTU1057 | d__Bacteria.p__Dependentiae.c__Babeliae.o__Babeliales.f__Babeliales.g__Babeliales.s__uncultured_organism |
| 1064 | OTU1064 | d__Bacteria.p__Dependentiae.c__Babeliae.o__Babeliales.f__Vermiphilaceae.g__Vermiphilaceae.s__uncultured_delta |
| 1070 | OTU1070 | d__Bacteria.p__Cyanobacteria.c__Vampirivibrionia.o__Gastranaerophilales.f__Gastranaerophilales.gGastranaerophilales. |
| 1086 | OTU1086 | d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.gRubripirellula. |
| 1097 | OTU1097 | d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__unidentified_bacterium |
| 1100 | OTU1100 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Cytophagales.f__Cyclobacteriaceae.g__uncultured.s__uncultured_soil |
| 1108 | OTU1108 | d__Bacteria.p__NB1.j.c__NB1.j.o__NB1.j.f__NB1.j.g__NB1.j.s__uncultured_prokaryote |
| 1114 | OTU1114 | d__Bacteria.p__Verrucomicrobiota.c__Lentisphaeria.o__SS1.B.02.17.f__SS1.B.02.17.g__SS1.B.02.17.s__uncultured_Lentisphaerae |
| 1121 | OTU1121 | d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Eel.36e1D6.g__Eel.36e1D6.s__uncultured_Polyangiaceae |
| 1127 | OTU1127 | d__Bacteria.p__Actinobacteriota.c__WCHB1.81.o__WCHB1.81.f__WCHB1.81.g__WCHB1.81.s__uncultured_actinobacterium |
| 1129 | OTU1129 | d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__Marine_Benthic_Group_D_and_DHVEG.1.f__Marine_Benthic_Group_D_and_DHVEG.1.g__Marine_Benthic_Group_D_and_DHVEG.1.s__archaeon_enrichment |
| 1142 | OTU1142 | d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Sulfurimonadaceae.. |
| 1150 | OTU1150 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.g__Candidatus_Thiobios.s__uncultured_gamma |
| 1160 | OTU1160 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Caulobacterales.f__Parvularculaceae.gParvularcula. |
| 1162 | OTU1162 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodobacterales.f__Rhodobacteraceae.gRoseobacter. |
| 1173 | OTU1173 | d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Actinomarinales.f__uncultured.g__uncultured.s__bacterium_WH6.7 |
| 1178 | OTU1178 | d__Bacteria.p__Firmicutes.c__Bacilli.o__Izemoplasmatales.f__Izemoplasmataceae.g__Izimaplasma.s__Candidatus_Izimaplasma |
| 1189 | OTU1189 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfobacteraceae.g__Desulfospira.s__uncultured_bacterium |
| 1194 | OTU1194 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Oceanospirillales.f__Saccharospirillaceae.gReinekea. |
| 1196 | OTU1196 | d__Bacteria.p__CK.2C2.2.c__CK.2C2.2.o__CK.2C2.2.f__CK.2C2.2.g__CK.2C2.2.s__uncultured_organism |
| 1198 | OTU1198 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Cryomorphaceae.g__Vicingus.s__uncultured_sediment |
| 1205 | OTU1205 | d__Bacteria.p__Chloroflexi.c__Chloroflexia.oThermomicrobiales... |
| 1226 | OTU1226 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Hyphomicrobiaceae.gFilomicrobium. |
| 1228 | OTU1228 | d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.g__Rhodopirellula.s__uncultured_hydrocarbon |
| 1235 | OTU1235 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Micavibrionales.f__uncultured.g__uncultured.s__uncultured_organism |
| 1238 | OTU1238 | d__Bacteria.p__Acidobacteriota.c__Thermoanaerobaculia.o__Thermoanaerobaculales.f__Thermoanaerobaculaceae.g__Subgroup_23.s__uncultured_prokaryote |
| 1240 | OTU1240 | d__Bacteria.p__Myxococcota.c__Polyangia.o__VHS.B4.70.f__VHS.B4.70.g__VHS.B4.70.s__uncultured_delta |
| 1248 | OTU1248 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__mle1.8.f__mle1.8.g__mle1.8.s__uncultured_organism |
| 1249 | OTU1249 | d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Sandaracinaceae.gSandaracinus. |
| 1250 | OTU1250 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Nitrosococcales.f__Nitrosococcaceae.gCI75cm.2.12. |
| 1284 | OTU1284 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Steroidobacterales.f__Woeseiaceae.g__JTB255_marine_benthic_group.s__uncultured_organism |
| 1285 | OTU1285 | d__Bacteria.p__Fibrobacterota.c__Fibrobacteria.o__Fibrobacterales.f__TG3.gTG3. |
| 1287 | OTU1287 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__OM182_clade.f__OM182_clade.g__OM182_clade.s__unidentified_marine |
| 1289 | OTU1289 | d__Bacteria.p__Verrucomicrobiota.c__Lentisphaeria.o__SS1.B.02.17.f__SS1.B.02.17.gSS1.B.02.17. |
| 1298 | OTU1298 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__uncultured.s__uncultured_Bacteroidetes |
| 1299 | OTU1299 | d__Bacteria.p__Verrucomicrobiota.c__Kiritimatiellae.o__Kiritimatiellales.f__Kiritimatiellaceae.g__R76.B128.s__Kiritimatiella_sp. |
| 1301 | OTU1301 | d__Bacteria.p__Fusobacteriota.c__Fusobacteriia.o__Fusobacteriales.f__Fusobacteriaceae.gFusobacterium. |
| 1302 | OTU1302 | d__Archaea.p__Asgardarchaeota.c__Lokiarchaeia.o__Lokiarchaeia.f__Lokiarchaeia.g__Lokiarchaeia.s__uncultured_crenarchaeote |
| 1303 | OTU1303 | d__Bacteria.p__Desulfobacterota.c__Desulfuromonadia.o__uncultured.f__uncultured.g__uncultured.s__uncultured_bacterium |
| 1308 | OTU1308 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__mle1.8.f__mle1.8.gmle1.8. |
| 1310 | OTU1310 | d__Bacteria.p__Spirochaetota.c__MVP.15.o__MVP.15.f__MVP.15.g__MVP.15.s__uncultured_bacterium |
| 1312 | OTU1312 | d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Sulfurospirillaceae.gSulfurospirillum. |
| 1314 | OTU1314 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Nitrosococcales.f__Nitrosococcaceae.g__SZB85.s__uncultured_bacterium |
| 1318 | OTU1318 | d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__Methanomassiliicoccales.f__uncultured.g__uncultured.s__uncultured_euryarchaeote |
| 1321 | OTU1321 | d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Cyanobacteriales.f__Xenococcaceae.g__Chroococcidiopsis_PCC.6712.s__uncultured_cyanobacterium |
| 1325 | OTU1325 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__uncultured.f__uncultured.guncultured. |
| 1329 | OTU1329 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfococcaceae.gDesulfonema. |
| 1330 | OTU1330 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiotrichales.f__Thiotrichaceae.. |
| 1338 | OTU1338 | d__Bacteria.p__Myxococcota.c__Polyangia.o__Blfdi19.f__Blfdi19.g__Blfdi19.s__uncultured_bacterium |
| 1343 | OTU1343 | d__Bacteria.p__Acidobacteriota.c__Thermoanaerobaculia.o__Thermoanaerobaculales.f__Thermoanaerobaculaceae.g__B276.D12.s__uncultured_bacterium |
| 1347 | OTU1347 | d__Bacteria.p__Fibrobacterota.c__Chitinivibrionia.o__uncultured.f__uncultured.guncultured. |
| 1350 | OTU1350 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gPolaribacter. |
| 1351 | OTU1351 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__Phycisphaerales.f__AKAU3564_sediment_group.g__AKAU3564_sediment_group.s__uncultured_bacterium |
| 1352 | OTU1352 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__D90.f__D90.g__D90.s__uncultured_gamma |
| 1353 | OTU1353 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__Amsterdam.1B.07.f__Amsterdam.1B.07.g__Amsterdam.1B.07.s__uncultured_bacterium |
| 1359 | OTU1359 | d__Bacteria.p__Patescibacteria.c__Parcubacteria.o__Candidatus_Kaiserbacteria.f__Candidatus_Kaiserbacteria.g__Candidatus_Kaiserbacteria.s__bacterium_SH4.10 |
| 1361 | OTU1361 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.gPelolinea. |
| 1362 | OTU1362 | d__Bacteria.p__Gemmatimonadota.c__BD2.11_terrestrial_group.o__BD2.11_terrestrial_group.f__BD2.11_terrestrial_group.g__BD2.11_terrestrial_group.s__unidentified_bacterium |
| 1363 | OTU1363 | d__Bacteria.p__Sumerlaeota.c__Sumerlaeia.o__uncultured.f__uncultured.g__uncultured.s__uncultured_organism |
| 1369 | OTU1369 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__Phycisphaerales.f__Phycisphaeraceae.g__uncultured.s__uncultured_bacterium |
| 1370 | OTU1370 | d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__uncultured_Acidobacterium |
| 1374 | OTU1374 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Defluviicoccales.f__Defluviicoccaceae.g__Defluviicoccus.s__uncultured_Rhodospirillaceae |
The separation between groups is 3.12.
We update the plot with the separations:
intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c(
"LR.max",
"LR.max.local",
"Aldex",
"glmnet",
"glmnet.ext",
"intersection")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",ylim=c(0,2.25),xlim=c(0,250),xaxp=c(0,250,10),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.02,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)
Resultados %>%
kbl() %>%
kable_styling()
| Size | Separation | |
|---|---|---|
| LR.max | 549 | 2.829712 |
| LR.max.local | 46 | 4.655234 |
| Aldex | 121 | 3.429094 |
| glmnet | 350 | 2.776603 |
| glmnet.ext | 104 | 4.421816 |
| intersection | 312 | 3.116542 |
tax_otu=as.data.frame(read_excel("TablaOTUs.xlsx"))[1:12,]
Samples=tax_otu$ID
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")
DF.0=tax_otu[,-c(1,2)]
row.names(DF.0)=Samples
colnames(DF.0)=OTUs
taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]
Miseria=as.vector(which(colSums(DF.0)<=2))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))
Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL
DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]
row.names(DF.0)=Samples
DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
DF=DF[7:12,]
DF.0=DF.0[7:12,]
DF.0.original=DF.0.original[7:12 ,]
Type=as.factor(c(rep("CAU_T2C", 3),rep("CAU_T2HW", 3)))
colors=c("purple","red")[Type]
After removing OTUs with 2 or fewer reads across the CAU samples and those appearing in only one of these samples, 1682 taxa remain.
The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:
Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
kbl() %>%
kable_styling()
| Sample | OTUs | Reads | Proportions | |
|---|---|---|---|---|
| 2 | CAU_T0_2 | OTU966 | 55 | 0.0012193 |
| 1 | CAU_T2HW_1 | OTU727 | 55 | 0.0008798 |
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)
HC=ClusterHC(DF,Grups=Type,colores=colors)
base=Sep(HC)
The two sample types are NOT well separated.
We examine the proportion of OTU subsets of reasonable size that already classify them correctly.
# Fijo una semilla de aleatoriedad para el experimento
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1715851715853234
initial_seed %% 10000
# [1] 3234
set.seed(3234)
Experimento=function(i)
{
ii=sample(dim(DF)[2],i,rep=FALSE)
DFtemp=DF[,ii]
CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=500
F.SS=c()
for (i in 10:1000)
{
print(i)
EE=replicate(n,Experimento(i))
EE.2=EE[2,which(EE[1,]==0)]
if(length(which(EE[1,]==0))>0){
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
F.SS=rbind(F.SS, c(i,
length(EE.2)/250,
mean(EE.2),
quantile(XX,0.025),
quantile(XX,0.975)
))}
else {
F.SS=rbind(F.SS, c(i,
0,
NA,
NA,
NA))
}
}
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CAU.T2CvsT2HW.RData")
F.SS=readRDS("Fraccio.Separadors.CAU.T2CvsT2HW.RData")
The first plot shows, for each \(n\) up to 600, the proportion of samples of size \(n\) that perfectly classify CAU_T2C and CAU_T2HW (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.
F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)
plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",ylim=c(0,0.1),xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,0.1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
The second plot shows, for each \(n\) up to 600, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CAU_T2C and CAU_T2HW (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve). Note that from \(n=350\) onward, NO subset is found that separates the two groups.
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,1.5,15))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
`
LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CAU_2C2HW.RData")
LRS=readRDS("LRS_CAU_2C2HW.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
saveRDS(assoc, file="assoc_CAU_2C2HW.RData")
assoc=readRDS("assoc_CAU_2C2HW.RData")
plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most relevant." ,pch=20,cex=0.5)
m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
HC=ClusterHC(DF[,impLR],barplot=FALSE,Grups=Type,colores=colors)
LR.max=c(length(impLR),Sep(HC))
The maximum is attained at the set of 95 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 1.98.
Significant=data.frame(
OTUs=paste("OTU",1:n.OTUs.original,sep=""),
taxa=taxa.original,
Log_ratio.max=Indicador(impLR))
coda_glmnet)coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
## [1] "No variables are selected. The prediction and the signature plots are not displayed."
coda.glmnet$`signature plot`
## NULL
coda_glmnet does not find taxa with significant differences, likely due to the low power with only 3 vs. 3 samples. When this happens, we add random replicates, with the idea that stochastic resonance may strengthen the signal. We test with 3, 5, and 10 replicates.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711435693
initial_seed %% 10000
# [1] 5693
# 3 replicates
set.seed(5693)
M=3
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T2C_1,repls$CAU_T2C_2,repls$CAU_T2C_3,repls$CAU_T2HW_1,repls$CAU_T2HW_2,repls$CAU_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T2C_1",1:M,sep="_"),
paste("CAU_T2C_2",1:M,sep="_"),
paste("CAU_T2C_3",1:M,sep="_"),
paste("CAU_T2HW_1",1:M,sep="_"),
paste("CAU_T2HW_2",1:M,sep="_"),
paste("CAU_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T2C", 3*M),rep("CAU_T2HW", 3*M),rep("CAU_T2C", 3),rep("CAU_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_2C2HW_3copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_2C2HW_3copies.RData")
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
coda.glmnet.3=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)
impGLMNET_ext.3=coda.glmnet.3$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.3],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.3=c(length(impGLMNET_ext.3),Sep(HC))
With 3 replicates, 42 significant taxa are found, perfectly separating the two groups, with a separation of 1.49.
# 5 replicates
set.seed(5693)
M=5
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T2C_1,repls$CAU_T2C_2,repls$CAU_T2C_3,repls$CAU_T2HW_1,repls$CAU_T2HW_2,repls$CAU_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T2C_1",1:M,sep="_"),
paste("CAU_T2C_2",1:M,sep="_"),
paste("CAU_T2C_3",1:M,sep="_"),
paste("CAU_T2HW_1",1:M,sep="_"),
paste("CAU_T2HW_2",1:M,sep="_"),
paste("CAU_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T2C", 3*M),rep("CAU_T2HW", 3*M),rep("CAU_T2C", 3),rep("CAU_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_2C2HW_5copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_2C2HW_5copies.RData")
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
coda.glmnet.5=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)
impGLMNET_ext.5=coda.glmnet.5$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.5],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.5=c(length(impGLMNET_ext.5),Sep(HC))
With 5 replicates, 53 significant taxa are found, perfectly separating the two groups, with a separation of 1.2.
Significant=cbind(Significant,
glmnet_sign.ext.5.T2C=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.5.T2HW=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
set.seed(5693)
M=10
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T2C_1,repls$CAU_T2C_2,repls$CAU_T2C_3,repls$CAU_T2HW_1,repls$CAU_T2HW_2,repls$CAU_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T2C_1",1:M,sep="_"),
paste("CAU_T2C_2",1:M,sep="_"),
paste("CAU_T2C_3",1:M,sep="_"),
paste("CAU_T2HW_1",1:M,sep="_"),
paste("CAU_T2HW_2",1:M,sep="_"),
paste("CAU_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T2C", 3*M),rep("CAU_T2HW", 3*M),rep("CAU_T2C", 3),rep("CAU_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_2C2HW_10copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_2C2HW_10copies.RData")
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
coda.glmnet.10=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)
impGLMNET_ext.10=coda.glmnet.10$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.10],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.10=c(length(impGLMNET_ext.10),Sep(HC))
With 10 replicates, 59 significant taxa are found, perfectly separating the two groups, with a separation of 1.12.
Significant=cbind(Significant,
glmnet_sign.ext.10.T2C=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`>0]),
glmnet_sign.ext.10.T2HW=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`<0]))
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711461735
initial_seed %% 10000
# [1] 1735
set.seed(1735)
repl_kw=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex_kw=aldex.kw(repl_kw, verbose=FALSE)
#
saveRDS(repl_kw, file="repl_kw_CAU_2C2HW.RData")
saveRDS(aldex_kw, file="aldex_kw_CAU_2C2HW.RData")
p.valores_kw=readRDS("aldex_kw_CAU_2C2HW.RData")
min(p.valores_kw[,1])
## [1] 0.04999719
min(p.valores_kw[,2])
## [1] 0.3641574
min(p.valores_kw[,3])
## [1] 0.002010094
min(p.valores_kw[,4])
## [1] 0.07355742
length(p.valores_kw[p.valores_kw[,3]<0.01,2])
## [1] 11
length(p.valores_kw[p.valores_kw[,3]<0.05,2])
## [1] 68
There are no taxa with even an unadjusted p-value < 0.05.
DF.prop=DF.0.original/rowSums(DF.0.original)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP.07=SMP.prop$CAU_T2C_CAU_T2HW$ord[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.7)]
indexos_SMP.05=SMP.prop$CAU_T2C_CAU_T2HW$ord[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.5)]
OTUs_SMP.07=sort(attr(SMP.prop$CAU_T2C_CAU_T2HW$cusum[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.7)],"names"))
OTUs_SMP.good.07=setdiff(OTUs_SMP.07,Unics$OTUs)
simper identifies 170 significant taxa, but they do not separate the samples well.
HC=ClusterHC(DF[,OTUs_SMP.good.07],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
Significant=cbind(Significant,
t(DF.prop))
write.csv2(Significant,"OTU_Significativos_CAU_T2.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
glmnet.ext.3,
glmnet.ext.5,
glmnet.ext.10
)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10")
colnames(Resultados)=c("Size","Separation")
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",ylim=c(0,2.25),xlim=c(0,250),xaxp=c(0,250,10),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.02,col="blue",labels=row.names(Resultados),cex=0.75)
Finally, we intersect the OTUs identified as significant according to LR and coda_glmnet with 3 replicates
Significant.capat=Significant[,c(1,2,3,4,5)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
There are 10 taxa. They are:
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| OTU38 | OTU38 | Unassigned;;;;;; |
| OTU108 | OTU108 | d__Bacteria;p__Spirochaetota;c__Spirochaetia;o__Spirochaetales;f__Spirochaetaceae;gGWE2-31-10; |
| OTU199 | OTU199 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Oceanospirillales;f__Halomonadaceae;gCobetia; |
| OTU208 | OTU208 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;gLutibacter; |
| OTU337 | OTU337 | d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__SJA-15;f__SJA-15;g__SJA-15;s__uncultured_Chloroflexi |
| OTU365 | OTU365 | d__Bacteria;p__Firmicutes;c__Clostridia;oLachnospirales;;; |
| OTU383 | OTU383 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Alteromonadales;f__Alteromonadaceae;; |
| OTU685 | OTU685 | d__Bacteria;p__PAUC34f;c__PAUC34f;o__PAUC34f;f__PAUC34f;g__PAUC34f;s__uncultured_bacterium |
| OTU708 | OTU708 | d__Bacteria;p__Patescibacteria;c__Gracilibacteria;o__Gracilibacteria;f__Gracilibacteria;g__Gracilibacteria;s__uncultured_organism |
| OTU1120 | OTU1120 | d__Bacteria;p__Calditrichota;c__Calditrichia;o__Calditrichales;f__Calditrichaceae;gCaldithrix; |
The separation between groups is 1.96.
We update the plot with the separations:
intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10",
"intersection")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",ylim=c(0,2.25),xlim=c(0,250),xaxp=c(0,250,10),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.02,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)
Resultados %>%
kbl() %>%
kable_styling()
| Size | Separation | |
|---|---|---|
| LR | 95 | 1.981986 |
| glmnet.ext.3 | 42 | 1.492428 |
| glmnet.ext.5 | 53 | 1.203924 |
| glmnet.ext.10 | 59 | 1.120839 |
| intersection | 10 | 1.960728 |
tax_otu=as.data.frame(read_excel("TablaOTUs.xlsx"))[13:24,]
Samples=tax_otu$ID
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")
DF.0=tax_otu[,-c(1,2)]
row.names(DF.0)=Samples
colnames(DF.0)=OTUs
taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]
indices=1:n.OTUs.original
hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))
Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL
Miseria=as.vector(which(colSums(DF.0)<=2|colSums(DF.0.hits)==1))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
indices=indices[-Miseria]
row.names(DF.0)=Samples
DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
DF.0=DF.0[7:12,]
DF=DF[7:12,]
DF.0.original=DF.0.original[7:12 ,]
Type=as.factor(c(rep("CYM_T2C", 3),rep("CYM_T2HW", 3)))
colors=c("purple","red")[Type]
After removing OTUs with 2 or fewer reads across the CAU samples and those appearing in only one of these samples, 2247 taxa remain.
The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:
Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
kbl() %>%
kable_styling()
| Sample | OTUs | Reads | Proportions |
|---|---|---|---|
| CYM_T2C_1 | OTU2121 | 186 | 0.0020402 |
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)
HC=ClusterHC(DF,Grups=Type,colores=colors)
base=Sep(HC)
The two sample types are perfectly separated from the start. The separation between groups is 1.21.
We examine the proportion of OTU subsets of reasonable size that already classify them correctly.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1714640507
initial_seed %% 10000
# [1] 507
set.seed(507)
Experimento=function(i)
{
ii=sample(dim(DF)[2],i,rep=FALSE)
DFtemp=DF[,ii]
CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=250
F.SS=c()
for (i in 10:1000)
{
print(i)
EE=replicate(n,Experimento(i))
EE.2=EE[2,which(EE[1,]==0)]
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
F.SS=rbind(F.SS, c(i,
length(EE.2)/250,
mean(EE.2),
quantile(XX,0.025),
quantile(XX,0.975)
))
}
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CYM.T2CvsT2HW.RData")
F.SS=readRDS("Fraccio.Separadors.CYM.T2CvsT2HW.RData")
The first plot shows, for each \(n\) up to 600, the proportion of samples of size \(n\) that perfectly classify CYM_T2C and CYM_T2HW (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.
F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)
plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",xlim=c(0,1000),xaxp=c(0,1000,20),yaxp=c(0,1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
The second plot shows, for each \(n\) up to 600, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CYM_T2C and CYM_T2HW (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve).
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,1000),xaxp=c(0,1000,20),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
abline(h=base,col="green")
`
LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CYM_2C2HW.RData")
LRS=readRDS("LRS_CYM_2C2HW.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
saveRDS(assoc, file="assoc_CYM_2C2HW.RData")
assoc=readRDS("assoc_CYM_2C2HW.RData")
plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most relevant." ,pch=20,cex=0.5)
m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
HC=ClusterHC(DF[,impLR],barplot=FALSE,Grups=Type,colores=colors)
LR.max=c(length(impLR),Sep(HC))
The maximum is attained at the set of 563 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 2.15.
Significant=data.frame(
OTUs=paste("OTU",1:n.OTUs.original,sep=""),
taxa=taxa.original,
Log_ratio.max=Indicador(impLR)
)
coda_glmnet)coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
## [1] "No variables are selected. The prediction and the signature plots are not displayed."
coda.glmnet$`signature plot`
## NULL
coda_glmnet does not find taxa with significant differences. We add random replicates. We test with 3, 5, and 10 replicates.
# 3 replicates
set.seed(5693)
M=3
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T2C_1,repls$CYM_T2C_2,repls$CYM_T2C_3,repls$CYM_T2HW_1,repls$CYM_T2HW_2,repls$CYM_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T2C_1",1:M,sep="_"),
paste("CYM_T2C_2",1:M,sep="_"),
paste("CYM_T2C_3",1:M,sep="_"),
paste("CYM_T2HW_1",1:M,sep="_"),
paste("CYM_T2HW_2",1:M,sep="_"),
paste("CYM_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T2C", 3*M),rep("CYM_T2HW", 3*M),rep("CYM_T2C", 3),rep("CYM_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_2C2HW_3copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_2C2HW_3copies.RData")
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
coda.glmnet.3=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)
impGLMNET_ext.3=coda.glmnet.3$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.3],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.3=c(length(impGLMNET_ext.3),Sep(HC))
With 3 replicates, 81 significant taxa are found, perfectly separating the two groups, with a separation of 3.67.
Significant=cbind(Significant,
glmnet_sign.ext.3.T2C=Indicador(coda.glmnet.3$taxa.num[coda.glmnet.3$`log-contrast coefficients`>0]),
glmnet_sign.ext.3.T2HW=Indicador(coda.glmnet.3$taxa.num[coda.glmnet.3$`log-contrast coefficients`<0]))
# 5 replicates
set.seed(5693)
M=5
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T2C_1,repls$CYM_T2C_2,repls$CYM_T2C_3,repls$CYM_T2HW_1,repls$CYM_T2HW_2,repls$CYM_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T2C_1",1:M,sep="_"),
paste("CYM_T2C_2",1:M,sep="_"),
paste("CYM_T2C_3",1:M,sep="_"),
paste("CYM_T2HW_1",1:M,sep="_"),
paste("CYM_T2HW_2",1:M,sep="_"),
paste("CYM_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T2C", 3*M),rep("CYM_T2HW", 3*M),rep("CYM_T2C", 3),rep("CYM_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_2C2HW_5copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_2C2HW_5copies.RData")
LogRat_mostres[[1]]=LogRat_mostres[[1]][c(1:30,37:42),]
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
coda.glmnet.5=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)
impGLMNET_ext.5=coda.glmnet.5$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.5],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.5=c(length(impGLMNET_ext.5),Sep(HC))
With 5 replicates, 84 significant taxa are found, perfectly separating the two groups, with a separation of 3.4.
Significant=cbind(Significant,
glmnet_sign.ext.5.T2C=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.5.T2HW=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
set.seed(5693)
M=10
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T2C_1,repls$CYM_T2C_2,repls$CYM_T2C_3,repls$CYM_T2HW_1,repls$CYM_T2HW_1,repls$CYM_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T2C_1",1:M,sep="_"),
paste("CYM_T2C_2",1:M,sep="_"),
paste("CYM_T2C_3",1:M,sep="_"),
paste("CYM_T2HW_1",1:M,sep="_"),
paste("CYM_T2HW_2",1:M,sep="_"),
paste("CYM_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T2C", 3*M),rep("CYM_T2HW", 3*M),rep("CYM_T2C", 3),rep("CYM_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_2C2HW_10copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_2C2HW_10copies.RData")
LogRat_mostres[[1]]=LogRat_mostres[[1]][c(1:60,67:72),]
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
coda.glmnet.10=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)
impGLMNET_ext.10=coda.glmnet.10$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.10],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.10=c(length(impGLMNET_ext.10),Sep(HC))
With 10 replicates, 82 significant taxa are found, perfectly separating the two groups, with a separation of 3.09.
Significant=cbind(Significant,
glmnet_sign.ext.10.T2C=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`>0]),
glmnet_sign.ext.10.T2HW=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`<0]))
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711815195
initial_seed %% 10000
# [1] 5195
set.seed(5195)
repl_kw=aldexCesc.clr(t(DF), Type=Type, mc.samples=1000, verbose=FALSE)
aldex_kw=aldex.kw(repl_kw, verbose=FALSE)
#
saveRDS(repl_kw, file="repl_kw_2C2HW.RData")
saveRDS(aldex_kw, file="aldex_kw_2C2HW.RData")
p.valores_kw=readRDS("aldex_kw_2C2HW.RData")
length(p.valores_kw[p.valores_kw[,1]<0.01,1])
## [1] 0
length(p.valores_kw[p.valores_kw[,1]<0.05,1])
## [1] 29
length(p.valores_kw[p.valores_kw[,2]<0.01,2])
## [1] 0
length(p.valores_kw[p.valores_kw[,2]<0.01,2])
## [1] 0
sep.kw.noadjust.0.05=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.05,1,BP=TRUE)
There are no taxa with a Kruskal–Wallis adjusted p-value < 0.05. There are 29 with an unadjusted p-value < 0.05 (none < 0.01); in the absence of a better option, we select them. They perfectly separate the two groups, with a separation of 3.11.
Significant=cbind(Significant,
Aldex.kw.noadjust.0.05=Indicador(sep.kw.noadjust.0.05))
DF.prop=DF.0.original/rowSums(DF.0.original)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP=SMP.prop$CYM_T2C_CYM_T2HW$ord[which(SMP.prop$CYM_T2C_CYM_T2HW$cusum<=0.7)]
OTUs_SMP=sort(attr(SMP.prop$CYM_T2C_CYM_T2HW$cusum[which(SMP.prop$CYM_T2C_CYM_T2HW$cusum<=0.7)],"names"))
OTUs_SMP.good=setdiff(OTUs_SMP,Unics$OTUs)
HC=ClusterHC(DF[,OTUs_SMP.good],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
simper.props=c(length(indexos_SMP),Sep(HC))
We obtain 120 significant taxa, which perfectly separate the two sample types.They include OTU2121, which we had previously removed because it appeared in only one sample. The separation between groups is 1.42.
Indicador.Gl=function(x,k=n.OTUs.original){
xx=rep(0,k)
xx[x]=1
return(xx)
}
Significant=cbind(Significant,
simper.rel=Indicador.Gl(indexos_SMP))
Significant=cbind(Significant,
t(DF.prop))
write.csv2(Significant,"OTU_Significativos_CYM_T2.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
glmnet.ext.3,
glmnet.ext.5,
glmnet.ext.10,
sep.kw.noadjust.0.05,
simper.props
)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10",
"Aldex",
"simper")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,6,20),ylim=c(0,6))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.01,col="blue",labels=row.names(Resultados),cex=0.75)
Finally, we intersect the OTUs identified as significant according to LR and coda_glmnet with 3 replicates
Significant.capat=Significant[,c(1,2,3,4,5)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
There are 60 taxa. They are:
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| OTU4 | OTU4 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;g__Robiginitalea;s__bacterium_AMSU |
| OTU6 | OTU6 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;g__Maritimimonas;s__uncultured_Bacteroidetes |
| OTU19 | OTU19 | d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__Anaerolineales;f__Anaerolineaceae;guncultured; |
| OTU25 | OTU25 | d__Archaea;p__Asgardarchaeota;c__Heimdallarchaeia;o__Heimdallarchaeia;f__Heimdallarchaeia;g__Heimdallarchaeia;s__uncultured_archaeon |
| OTU26 | OTU26 | d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__SBR1031;f__SBR1031;g__SBR1031;s__uncultured_organism |
| OTU27 | OTU27 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;g__Ulvibacter;s__Aureitalea_sp. |
| OTU31 | OTU31 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;oEctothiorhodospirales;;; |
| OTU34 | OTU34 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;gMaribacter; |
| OTU48 | OTU48 | d__Bacteria;p__Patescibacteria;c__Parcubacteria;;;; |
| OTU56 | OTU56 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Arenicellales;f__Arenicellaceae;guncultured; |
| OTU78 | OTU78 | d__Bacteria;p__Spirochaetota;c__Spirochaetia;o__Spirochaetales;f__Spirochaetaceae;gSpirochaeta; |
| OTU85 | OTU85 | d__Bacteria;pChloroflexi;;;;; |
| OTU94 | OTU94 | d__Bacteria;p__Spirochaetota;c__Spirochaetia;o__Spirochaetales;f__Spirochaetaceae;g__Spirochaeta;s__uncultured_bacterium |
| OTU102 | OTU102 | d__Bacteria;p__Bacteroidota;c__Ignavibacteria;o__Ignavibacteriales;f__Melioribacteraceae;g__IheB3-7;s__uncultured_Chlorobi |
| OTU107 | OTU107 | d__Bacteria;p__Actinobacteriota;c__WCHB1-81;o__WCHB1-81;f__WCHB1-81;gWCHB1-81; |
| OTU118 | OTU118 | d__Archaea;p__Thermoplasmatota;c__Thermoplasmata;o__Marine_Benthic_Group_D_and_DHVEG-1;f__Marine_Benthic_Group_D_and_DHVEG-1;g__Marine_Benthic_Group_D_and_DHVEG-1;s__uncultured_archaeon |
| OTU123 | OTU123 | d__Bacteria;p__Planctomycetota;c__Planctomycetes;o__Pirellulales;f__Pirellulaceae;gPir4_lineage; |
| OTU126 | OTU126 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;guncultured; |
| OTU133 | OTU133 | d__Bacteria;p__Desulfobacterota;c__Desulfobacteria;o__Desulfatiglandales;f__Desulfatiglandaceae;gDesulfatiglans; |
| OTU136 | OTU136 | d__Bacteria;p__Patescibacteria;c__Parcubacteria;o__Candidatus_Kaiserbacteria;f__Candidatus_Kaiserbacteria;g__Candidatus_Kaiserbacteria;s__uncultured_bacterium |
| OTU137 | OTU137 | d__Archaea;p__Thermoplasmatota;c__Thermoplasmata;o__Marine_Benthic_Group_D_and_DHVEG-1;f__Marine_Benthic_Group_D_and_DHVEG-1;gMarine_Benthic_Group_D_and_DHVEG-1; |
| OTU143 | OTU143 | d__Archaea;p__Crenarchaeota;c__Bathyarchaeia;o__Bathyarchaeia;f__Bathyarchaeia;gBathyarchaeia; |
| OTU144 | OTU144 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;gAquibacter; |
| OTU163 | OTU163 | d__Bacteria;p__Modulibacteria;c__Moduliflexia;o__Moduliflexales;f__Moduliflexaceae;g__Moduliflexaceae;s__uncultured_bacterium |
| OTU168 | OTU168 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Thiomicrospirales;f__Thiomicrospiraceae;gendosymbionts; |
| OTU174 | OTU174 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__SB-5;g__SB-5;s__bacterium_YC-LK-LKJ31 |
| OTU178 | OTU178 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;gWinogradskyella; |
| OTU181 | OTU181 | d__Bacteria;p__Chloroflexi;c__Anaerolineae;;;; |
| OTU182 | OTU182 | d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__uncultured;f__uncultured;guncultured; |
| OTU186 | OTU186 | d__Bacteria;p__LCP-89;c__LCP-89;o__LCP-89;f__LCP-89;g__LCP-89;s__uncultured_bacterium |
| OTU200 | OTU200 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;; |
| OTU212 | OTU212 | d__Bacteria;p__Bdellovibrionota;c__Bdellovibrionia;o__Bacteriovoracales;f__Bacteriovoracaceae;g__uncultured;s__uncultured_bacterium |
| OTU216 | OTU216 | d__Bacteria;p__Actinobacteriota;c__WCHB1-81;o__WCHB1-81;f__WCHB1-81;g__WCHB1-81;s__uncultured_bacterium |
| OTU224 | OTU224 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Chromatiales;f__Sedimenticolaceae;g__uncultured;s__uncultured_bacterium |
| OTU241 | OTU241 | d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__Anaerolineales;f__Anaerolineaceae;g__uncultured;s__uncultured_Chloroflexi |
| OTU283 | OTU283 | d__Bacteria;p__Calditrichota;c__Calditrichia;o__Calditrichales;f__Calditrichaceae;; |
| OTU296 | OTU296 | d__Bacteria;p__Patescibacteria;c__Parcubacteria;o__Candidatus_Kaiserbacteria;f__Candidatus_Kaiserbacteria;g__Candidatus_Kaiserbacteria;s__uncultured_prokaryote |
| OTU309 | OTU309 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;gLewinella; |
| OTU329 | OTU329 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;g__Aquibacter;s__uncultured_marine |
| OTU347 | OTU347 | d__Bacteria;p__Chloroflexi;c__Dehalococcoidia;;;; |
| OTU372 | OTU372 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Oceanospirillales;f__Pseudohongiellaceae;gPseudohongiella; |
| OTU412 | OTU412 | d__Bacteria;p__Desulfobacterota;c__Desulfobulbia;oDesulfobulbales;;; |
| OTU426 | OTU426 | d__Bacteria;p__Verrucomicrobiota;c__Omnitrophia;o__Omnitrophales;f__Omnitrophaceae;g__Candidatus_Omnitrophus;s__uncultured_bacterium |
| OTU433 | OTU433 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;g__Lewinella;s__uncultured_marine |
| OTU490 | OTU490 | d__Bacteria;p__Bdellovibrionota;c__Bdellovibrionia;o__Bacteriovoracales;f__Bacteriovoracaceae;gPeredibacter; |
| OTU516 | OTU516 | d__Bacteria;p__Bdellovibrionota;c__Bdellovibrionia;o__Bdellovibrionales;f__Bdellovibrionaceae;gBdellovibrio; |
| OTU565 | OTU565 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;g__Portibacter;s__uncultured_Bacteroidetes |
| OTU574 | OTU574 | d__Bacteria;p__Planctomycetota;c__Planctomycetes;o__Pirellulales;f__Pirellulaceae;g__uncultured;s__uncultured_organism |
| OTU597 | OTU597 | d__Bacteria;p__Acidobacteriota;c__Thermoanaerobaculia;o__Thermoanaerobaculales;f__Thermoanaerobaculaceae;g__Subgroup_23;s__uncultured_bacterium |
| OTU622 | OTU622 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__uncultured;g__uncultured;s__uncultured_Bacteroidetes |
| OTU625 | OTU625 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__pItb-vmat-80;f__pItb-vmat-80;gpItb-vmat-80; |
| OTU659 | OTU659 | d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rickettsiales;f__Mitochondria;g__Mitochondria;s__uncultured_bacterium |
| OTU785 | OTU785 | d__Archaea;p__Asgardarchaeota;c__Odinarchaeia;o__Odinarchaeia;f__Odinarchaeia;g__Odinarchaeia;s__uncultured_crenarchaeote |
| OTU854 | OTU854 | d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Opitutales;f__Puniceicoccaceae;gLentimonas; |
| OTU980 | OTU980 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;g__Phaeodactylibacter;s__uncultured_organism |
| OTU1088 | OTU1088 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Cellvibrionales;f__Halieaceae;gCongregibacter; |
| OTU1502 | OTU1502 | d__Archaea;p__Thermoplasmatota;c__Thermoplasmatota;o__Thermoplasmatota;f__Thermoplasmatota;guncultured; |
| OTU1532 | OTU1532 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Chromatiales;f__Chromatiaceae;g__Candidatus_Thiobios;s__uncultured_organism |
| OTU1539 | OTU1539 | d__Bacteria;p__Acidobacteriota;c__Vicinamibacteria;o__Subgroup_17;f__Subgroup_17;gSubgroup_17; |
| OTU2370 | OTU2370 | d__Bacteria;p__Bdellovibrionota;c__Bdellovibrionia;o__Bacteriovoracales;f__Bacteriovoracaceae;g__uncultured;s__uncultured_deep-sea |
The separation between groups is 4.31.
We update the plot with the separations:
intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10",
"Aldex",
"simper",
"Intersection")
colnames(Resultados)=c("Size","Separation")
plot(F.SS[-1,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,150),xaxp=c(0,150,15),yaxp=c(0,6,12),ylim=c(0,6))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados[-dim(Resultados)[1],],col="blue",type="h")
points(Resultados[-dim(Resultados)[1],],col="blue",pch=20)
points(intersección[1],intersección[2],col="purple",type="h")
points(intersección[1],intersección[2],col="purple",pch=20)
text(Resultados[-dim(Resultados)[1],1],Resultados[-dim(Resultados)[1],2]+0.1,col="blue",labels=row.names(Resultados)[-dim(Resultados)[1]],cex=0.75)
text(intersección[1],intersección[2]+0.1,col="purple",labels="intersección",cex=0.75)
abline(h=base,col="green")
text(20,base+0.1,col="green",labels="Global",cex=0.75)
Resultados %>%
kbl() %>%
kable_styling()
| Size | Separation | |
|---|---|---|
| LR | 563 | 2.150788 |
| glmnet.ext.3 | 81 | 3.668474 |
| glmnet.ext.5 | 84 | 3.399211 |
| glmnet.ext.10 | 82 | 3.085298 |
| Aldex | 29 | 3.114325 |
| simper | 120 | 1.415174 |
| Intersection | 60 | 4.306244 |
tax_otu=as.data.frame(read_excel("TablaOTUs.xlsx"))[1:12,]
Samples=tax_otu$ID
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")
DF.0=tax_otu[,-c(1,2)]
row.names(DF.0)=Samples
colnames(DF.0)=OTUs
taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]
Miseria=as.vector(which(colSums(DF.0[1:6,])<=3))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))
Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL
DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]
row.names(DF.0)=Samples
DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
DF=DF[1:6,]
DF.0=DF.0[1:6,]
DF.0.original=DF.0.original[1:6 ,]
Type=as.factor(c(rep("CAU_T0", 3),rep("CAU_T1", 3)))
colors=c("blue","green")[Type]
After removing OTUs with 2 or fewer reads across the samples and those appearing in only one of these samples, 1194 taxa remain.
The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:
Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
kbl() %>%
kable_styling()
| Sample | OTUs | Reads | Proportions |
|---|---|---|---|
| CAU_T0_2 | OTU966 | 55 | 0.0012226 |
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)
HC=ClusterHC(DF,Grups=Type,colores=colors)
base=Sep(HC)
The two sample types are not initially separated.
We now examine the proportion of OTU subsets of reasonable size that already classify them correctly.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717749309
initial_seed %% 10000
# [1] 9309
set.seed(9309)
Experimento=function(i)
{
ii=sample(dim(DF)[2],i,rep=FALSE)
DFtemp=DF[,ii]
CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=500
F.SS=c()
for (i in 10:600)
{
print(i)
EE=replicate(n,Experimento(i))
EE.2=EE[2,which(EE[1,]==0)]
if(length(which(EE[1,]==0))>0){
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
F.SS=rbind(F.SS, c(i,
length(EE.2)/250,
mean(EE.2),
quantile(XX,0.025),
quantile(XX,0.975)
))}
else {
F.SS=rbind(F.SS, c(i,
0,
NA,
NA,
NA))
}
}
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CAU.T0vsT1.RData")
F.SS=readRDS("Fraccio.Separadors.CAU.T0vsT1.RData")
The first plot shows, for each \(n\) up to 500, the proportion of samples of size \(n\) that perfectly classify CAU_T0 and CAU_T1 (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.
F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)
plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",ylim=c(0,0.1),xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,0.1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
The second plot shows, for each \(n\) up to 600, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CAU_T0 and CAU_T1 (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve).
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,1.5,15))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
`
LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CAU_T0T1.RData")
LRS=readRDS("LRS_CAU_T0T1.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most relevant." ,pch=20,cex=0.5)
m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
HC=ClusterHC(DF[,impLR],barplot=FALSE,Grups=Type,colores=colors)
LR.max=c(length(impLR),Sep(HC))
The maximum is attained at the set of 24 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 2.3.
Significant=data.frame(
OTUs=paste("OTU",1:n.OTUs.original,sep=""),
taxa=taxa.original,
Log_ratio.max=Indicador(impLR))
coda_glmnet)coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
## [1] "No variables are selected. The prediction and the signature plots are not displayed."
coda.glmnet$`signature plot`
## NULL
coda_glmnet does not find taxa with significant differences. We add random replicates. We test with 3, 5, and 10 replicates.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717764843
initial_seed %% 10000
# [1] 4843
# 3 replicates
set.seed(4843)
M=3
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T0_1,repls$CAU_T0_2,repls$CAU_T0_3,repls$CAU_T1_1,repls$CAU_T1_2,repls$CAU_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T0_1",1:M,sep="_"),
paste("CAU_T0_2",1:M,sep="_"),
paste("CAU_T0_3",1:M,sep="_"),
paste("CAU_T1_1",1:M,sep="_"),
paste("CAU_T1_2",1:M,sep="_"),
paste("CAU_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T0", 3*M),rep("CAU_T1", 3*M),rep("CAU_T0", 3),rep("CAU_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_T0T1_3copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_T0T1_3copies.RData")
coda.glmnet.3=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)
impGLMNET_ext.3=coda.glmnet.3$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.3],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.3=c(length(impGLMNET_ext.3),Sep(HC))
With 3 replicates, 45 significant taxa are found, perfectly separating the two groups, with a separation of 0.95.
Significant=cbind(Significant,
glmnet_sign.ext.3.T0=Indicador(coda.glmnet.3$taxa.num[coda.glmnet.3$`log-contrast coefficients`>0]),
glmnet_sign.ext.3.T1=Indicador(coda.glmnet.3$taxa.num[coda.glmnet.3$`log-contrast coefficients`<0]))
# 5 replicates
set.seed(4843)
M=5
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T0_1,repls$CAU_T0_2,repls$CAU_T0_3,repls$CAU_T1_1,repls$CAU_T1_2,repls$CAU_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T0_1",1:M,sep="_"),
paste("CAU_T0_2",1:M,sep="_"),
paste("CAU_T0_3",1:M,sep="_"),
paste("CAU_T1_1",1:M,sep="_"),
paste("CAU_T1_2",1:M,sep="_"),
paste("CAU_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T0", 3*M),rep("CAU_T1", 3*M),rep("CAU_T0", 3),rep("CAU_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_T0T1_5copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_T0T1_5copies.RData")
coda.glmnet.5=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)
impGLMNET_ext.5=coda.glmnet.5$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.5],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.5=c(length(impGLMNET_ext.5),Sep(HC))
With 5 replicates, 51 significant taxa are found, perfectly separating the two groups, with a separation of 0.88.
Significant=cbind(Significant,
glmnet_sign.ext.5.T0=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.5.T1=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
# 10 replicates
set.seed(4843)
M=10
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T0_1,repls$CAU_T0_2,repls$CAU_T0_3,repls$CAU_T1_1,repls$CAU_T1_2,repls$CAU_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T0_1",1:M,sep="_"),
paste("CAU_T0_2",1:M,sep="_"),
paste("CAU_T0_3",1:M,sep="_"),
paste("CAU_T1_1",1:M,sep="_"),
paste("CAU_T1_2",1:M,sep="_"),
paste("CAU_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T0", 3*M),rep("CAU_T1", 3*M),rep("CAU_T0", 3),rep("CAU_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_T0T1_10copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_T0T1_10copies.RData")
coda.glmnet.10=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)
impGLMNET_ext.10=coda.glmnet.10$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.10],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.10=c(length(impGLMNET_ext.10),Sep(HC))
With 10 replicates, 49 significant taxa are found, but they do not classify well the samples.
Significant=cbind(Significant,
glmnet_sign.ext.10.T0=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.10$`log-contrast coefficients`>0]),
glmnet_sign.ext.10.T1=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.10$`log-contrast coefficients`<0]))
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717772151
initial_seed %% 10000
# [1] 2151
set.seed(2151)
repl_kw=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex_kw=aldex.kw(repl_kw, verbose=FALSE)
#
saveRDS(aldex_kw, file="aldex_kw_CAU_T0T1.RData")
p.valores_kw=readRDS("aldex_kw_CAU_T0T1.RData")
length(p.valores_kw[p.valores_kw[,1]<0.05,2])
## [1] 0
There are no taxa with a p-value < 0.05, even without adjustment.
The significant taxa identified by simper do not classify the samples well:
DF.prop=DF.0.original/rowSums(DF.0.original)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP=SMP.prop$CAU_T0_CAU_T1$ord[which(SMP.prop$CAU_T0_CAU_T1$cusum<=0.7)]
OTUs_SMP=sort(attr(SMP.prop$CAU_T0_CAU_T1$cusum[which(SMP.prop$CAU_T0_CAU_T1$cusum<=0.7)],"names"))
OTUs_SMP.good=setdiff(OTUs_SMP,Unics$OTUs)
HC=ClusterHC(DF[,OTUs_SMP.good],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
Significant=cbind(Significant,
t(DF.prop))
write.csv2(Significant,"OTU_Significativos_CAU_T0T1.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
glmnet.ext.3,
glmnet.ext.5,
glmnet.ext.10
)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,650),xaxp=c(0,650,13),ylim=c(0,max(Resultados[,2])+0.1))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.01,col="blue",labels=row.names(Resultados),cex=0.75)
We intersect the OTUs identified as significant according to LR and coda_glmnet with 3 replicates
Significant.capat=Significant[,c(1,2,3,4,5)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| OTU952 | OTU952 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Beggiatoales;f__Beggiatoaceae;g__Thioflexothrix;s__Thiotrichales_bacterium |
A single OTU. The same happens with 5 replicates:
Significant.capat=Significant[,c(1,2,3,6,7)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| OTU828 | OTU828 | d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Rubritaleaceae;g__Roseibacillus;s__uncultured_organism |
Resultados %>%
kbl() %>%
kable_styling()
| Size | Separation | |
|---|---|---|
| LR | 24 | 2.2969800 |
| glmnet.ext.3 | 45 | 0.9462113 |
| glmnet.ext.5 | 51 | 0.8765157 |
| glmnet.ext.10 | 49 | 0.8640208 |
tax_otu=as.data.frame(read_excel("TablaOTUs.xlsx"))[13:24,]
Samples=tax_otu$ID
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")
DF.0=tax_otu[,-c(1,2)]
row.names(DF.0)=Samples
colnames(DF.0)=OTUs
taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]
Miseria=as.vector(which(colSums(DF.0[1:6,])<=3))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))
Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL
DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]
row.names(DF.0)=Samples
DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
DF=DF[1:6,]
DF.0=DF.0[1:6,]
DF.0.original=DF.0.original[1:6 ,]
Type=as.factor(c(rep("CYM_T0", 3),rep("CYM_T1", 3)))
colors=c("blue","green")[Type]
After removing OTUs with 2 or fewer reads across the samples and those appearing in only one of these samples, 1658 taxa remain.
The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:
Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
kbl() %>%
kable_styling()
| Sample | OTUs | Reads | Proportions |
|---|---|---|---|
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)
HC=ClusterHC(DF,Grups=Type,colores=colors)
base=Sep(HC)
The two sample types are not initially separated.
We now examine the proportion of OTU subsets of reasonable size that classify them correctly.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717668665
initial_seed %% 10000
# [1] 8665
set.seed(8665)
Experimento=function(i)
{
ii=sample(dim(DF)[2],i,rep=FALSE)
DFtemp=DF[,ii]
CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=250
F.SS=c()
for (i in 10:650) #10
{
print(i)
EE=replicate(n,Experimento(i))
EE.2=EE[2,which(EE[1,]==0)]
if(length(which(EE[1,]==0))>0){
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
F.SS=rbind(F.SS, c(i,
length(EE.2)/250,
mean(EE.2),
quantile(XX,0.025),
quantile(XX,0.975)
))}
else {
F.SS=rbind(F.SS, c(i,
0,
NA,
NA,
NA))
}
}
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CYM.T0vsT1.RData")
F.SS=readRDS("Fraccio.Separadors.CYM.T0vsT1.RData")
The first plot shows, for each \(n\) up to 500, the proportion of samples of size \(n\) that perfectly classify CYM_T0 and CYM_T1 (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.
F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)
plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",ylim=c(0,0.1),xlim=c(0,650),xaxp=c(0,650,13),yaxp=c(0,0.1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
The second plot shows, for each \(n\) up to 600, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CYM_T0 and CYM_T1 (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve).
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,650),xaxp=c(0,650,13),yaxp=c(0,1.5,15))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
`
LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CYM_T0T1.RData")
LRS=readRDS("LRS_CYM_T0T1.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most relevant." ,pch=20,cex=0.5)
m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
HC=ClusterHC(DF[,impLR],barplot=FALSE,Grups=Type,colores=colors)
LR.max=c(length(impLR),Sep(HC))
The maximum is attained at the set of 169 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 2.46.
Significant=data.frame(
OTUs=paste("OTU",1:n.OTUs.original,sep=""),
taxa=taxa.original,
Log_ratio.max=Indicador(impLR))
coda_glmnet)coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
## [1] "No variables are selected. The prediction and the signature plots are not displayed."
coda.glmnet$`signature plot`
## NULL
coda_glmnet does not find taxa with significant differences. We add random replicates. We test with 3, 5, and 10 replicates.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717709647
initial_seed %% 10000
# [1] 9647
# 3 replicates
set.seed(9647)
M=3
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T0_1,repls$CYM_T0_2,repls$CYM_T0_3,repls$CYM_T1_1,repls$CYM_T1_2,repls$CYM_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T0_1",1:M,sep="_"),
paste("CYM_T0_2",1:M,sep="_"),
paste("CYM_T0_3",1:M,sep="_"),
paste("CYM_T1_1",1:M,sep="_"),
paste("CYM_T1_2",1:M,sep="_"),
paste("CYM_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T0", 3*M),rep("CYM_T1", 3*M),rep("CYM_T0", 3),rep("CYM_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_T0T1_3copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_T0T1_3copies.RData")
coda.glmnet.3=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)
impGLMNET_ext.3=coda.glmnet.3$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.3],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.3=c(length(impGLMNET_ext.3),Sep(HC))
With 3 replicates, 62 significant taxa are found, perfectly separating the two groups, with a separation of 2.05.
Significant=cbind(Significant,
glmnet_sign.ext.3.T0=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.3.T1=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
# 5 replicates
set.seed(9647)
M=5
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T0_1,repls$CYM_T0_2,repls$CYM_T0_3,repls$CYM_T1_1,repls$CYM_T1_2,repls$CYM_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T0_1",1:M,sep="_"),
paste("CYM_T0_2",1:M,sep="_"),
paste("CYM_T0_3",1:M,sep="_"),
paste("CYM_T1_1",1:M,sep="_"),
paste("CYM_T1_2",1:M,sep="_"),
paste("CYM_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T0", 3*M),rep("CYM_T1", 3*M),rep("CYM_T0", 3),rep("CYM_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_T0T1_5copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_T0T1_5copies.RData")
coda.glmnet.5=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)
impGLMNET_ext.5=coda.glmnet.5$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.5],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.5=c(length(impGLMNET_ext.5),Sep(HC))
With 5 replicates, 74 significant taxa are found, perfectly separating the two groups, with a separation of 1.36.
Significant=cbind(Significant,
glmnet_sign.ext.5.T0=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.5.T1=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
# 10 replicates
set.seed(9647)
M=10
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T0_1,repls$CYM_T0_2,repls$CYM_T0_3,repls$CYM_T1_1,repls$CYM_T1_2,repls$CYM_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T0_1",1:M,sep="_"),
paste("CYM_T0_2",1:M,sep="_"),
paste("CYM_T0_3",1:M,sep="_"),
paste("CYM_T1_1",1:M,sep="_"),
paste("CYM_T1_2",1:M,sep="_"),
paste("CYM_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T0", 3*M),rep("CYM_T1", 3*M),rep("CYM_T0", 3),rep("CYM_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_T0T1_10copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_T0T1_10copies.RData")
coda.glmnet.10=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)
impGLMNET_ext.10=coda.glmnet.10$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.10],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.10=c(length(impGLMNET_ext.10),Sep(HC))
With 10 replicates, 67 significant taxa are found, perfectly separating the two groups, with a separation of 1.26.
Significant=cbind(Significant,
glmnet_sign.ext.10.T0=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`>0]),
glmnet_sign.ext.10.T1=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`<0]))
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717738742
initial_seed %% 10000
# [1] 8742
set.seed(8742)
repl_kw=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex_kw=aldex.kw(repl_kw, verbose=FALSE)
#
repl_glm=aldexCesc.clr(t(DF), conds=model.matrix(~Type,data.frame(Type)), mc.samples=1000, verbose=FALSE)
aldex_glm=aldex.glm(repl_glm, model.matrix(~Type,data.frame(Type)))
#'
saveRDS(aldex_kw, file="aldex_kw_CYM_T0T1.RData")
saveRDS(aldex_glm, file="aldex_glm_CYM_T0T1.RData")
p.valores_kw=readRDS("aldex_kw_CYM_T0T1.RData")
length(p.valores_kw[p.valores_kw[,2]<0.05,2])
## [1] 0
length(p.valores_kw[p.valores_kw[,4]<0.05,2])
## [1] 88
length(p.valores_kw[p.valores_kw[,4]<0.01,4])
## [1] 6
There are no OTUs with an adjusted Kruskal–Wallis p-value < 0.05. There are 66 OTUs with an unadjusted p-value < 0.05, but they do not separate the samples well:
sep.kw.noadjust.0.05=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.05,1,BP=FALSE)
Significant=cbind(Significant,
Aldex.kw.noadjust.0.05=Indicador(sep.kw.noadjust.0.05))
The significant taxa identified by simper do not classify the samples well:
DF.prop=DF.0.original/rowSums(DF.0.original)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP=SMP.prop$CYM_T0_CYM_T1$ord[which(SMP.prop$CYM_T0_CYM_T1$cusum<=0.7)]
OTUs_SMP=sort(attr(SMP.prop$CYM_T0_CYM_T1$cusum[which(SMP.prop$CYM_T0_CYM_T1$cusum<=0.7)],"names"))
OTUs_SMP.good=setdiff(OTUs_SMP,Unics$OTUs)
HC=ClusterHC(DF[,OTUs_SMP.good],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
Significant=cbind(Significant,
t(DF.prop))
write.csv2(Significant,"OTU_Significativos_CYM_T0T1.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
glmnet.ext.3,
glmnet.ext.5,
glmnet.ext.10)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10"
)
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,650),xaxp=c(0,650,13),ylim=c(0,max(Resultados[,2])+0.1))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.01,col="blue",labels=row.names(Resultados),cex=0.75)
We intersect the OTUs identified as significant according to LR and coda_glmnet with 3 replicates.
Significant.capat=Significant[,c(1,2,3,4,5)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
There are 1 taxa. They are:
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| OTU235 | OTU235 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Thiomicrospirales;f__Thiomicrospiraceae;g__endosymbionts;s__uncultured_bacterium |
The separation between groups is 0.98.
We update the plot with the separations:
intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10",
"intersection")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",ylim=c(0,2.25),xlim=c(0,250),xaxp=c(0,250,10),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.02,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)
Resultados %>%
kbl() %>%
kable_styling()